summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c64
-rw-r--r--fs/9p/fid.c8
-rw-r--r--fs/9p/v9fs.c59
-rw-r--r--fs/9p/vfs_addr.c13
-rw-r--r--fs/9p/vfs_dentry.c12
-rw-r--r--fs/9p/vfs_dir.c13
-rw-r--r--fs/9p/vfs_file.c34
-rw-r--r--fs/9p/vfs_inode.c165
-rw-r--r--fs/9p/vfs_inode_dotl.c127
-rw-r--r--fs/9p/vfs_super.c12
-rw-r--r--fs/9p/xattr.c16
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/aio.c11
-rw-r--r--fs/autofs4/autofs_i.h1
-rw-r--r--fs/autofs4/inode.c1
-rw-r--r--fs/autofs4/waitq.c40
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/block_dev.c13
-rw-r--r--fs/btrfs/disk-io.c5
-rw-r--r--fs/btrfs/file.c5
-rw-r--r--fs/ceph/dir.c76
-rw-r--r--fs/ceph/export.c6
-rw-r--r--fs/ceph/inode.c3
-rw-r--r--fs/ceph/mds_client.c4
-rw-r--r--fs/ceph/super.c16
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/ceph/xattr.c22
-rw-r--r--fs/coda/cnode.c38
-rw-r--r--fs/coda/coda_fs_i.h4
-rw-r--r--fs/coda/dir.c29
-rw-r--r--fs/coda/inode.c10
-rw-r--r--fs/dcache.c26
-rw-r--r--fs/direct-io.c57
-rw-r--r--fs/dlm/config.c130
-rw-r--r--fs/dlm/config.h17
-rw-r--r--fs/dlm/debug_fs.c28
-rw-r--r--fs/dlm/dir.c1
-rw-r--r--fs/dlm/dlm_internal.h60
-rw-r--r--fs/dlm/lock.c87
-rw-r--r--fs/dlm/lockspace.c71
-rw-r--r--fs/dlm/member.c486
-rw-r--r--fs/dlm/member.h10
-rw-r--r--fs/dlm/rcom.c99
-rw-r--r--fs/dlm/rcom.h2
-rw-r--r--fs/dlm/recover.c87
-rw-r--r--fs/dlm/recoverd.c53
-rw-r--r--fs/dlm/user.c5
-rw-r--r--fs/eventpoll.c234
-rw-r--r--fs/exec.c4
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/ext4.h29
-rw-r--r--fs/ext4/extents.c10
-rw-r--r--fs/ext4/ialloc.c18
-rw-r--r--fs/ext4/inode.c143
-rw-r--r--fs/ext4/ioctl.c86
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/resize.c1175
-rw-r--r--fs/ext4/super.c11
-rw-r--r--fs/ext4/xattr_security.c5
-rw-r--r--fs/fs-writeback.c16
-rw-r--r--fs/fuse/dev.c57
-rw-r--r--fs/fuse/dir.c58
-rw-r--r--fs/fuse/file.c58
-rw-r--r--fs/fuse/fuse_i.h10
-rw-r--r--fs/gfs2/glock.c2
-rw-r--r--fs/gfs2/glock.h7
-rw-r--r--fs/gfs2/incore.h60
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/lock_dlm.c993
-rw-r--r--fs/gfs2/main.c10
-rw-r--r--fs/gfs2/ops_fstype.c31
-rw-r--r--fs/gfs2/recovery.c11
-rw-r--r--fs/gfs2/rgrp.c2
-rw-r--r--fs/gfs2/sys.c33
-rw-r--r--fs/gfs2/sys.h2
-rw-r--r--fs/hfsplus/super.c11
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/inode.c2
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/revoke.c34
-rw-r--r--fs/jbd2/transaction.c5
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/client.c12
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/idmap.c83
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/internal.h4
-rw-r--r--fs/nfs/nfs4_fs.h3
-rw-r--r--fs/nfs/nfs4filelayout.c9
-rw-r--r--fs/nfs/nfs4proc.c177
-rw-r--r--fs/nfs/nfs4state.c104
-rw-r--r--fs/nfs/nfs4xdr.c137
-rw-r--r--fs/nfs/objlayout/objio_osd.c3
-rw-r--r--fs/nfs/objlayout/objlayout.c4
-rw-r--r--fs/nfs/pnfs.c42
-rw-r--r--fs/nfs/pnfs.h1
-rw-r--r--fs/nfs/super.c43
-rw-r--r--fs/nfs/write.c31
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/ocfs2/stack_user.c4
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c449
-rw-r--r--fs/proc/inode.c18
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/root.c70
-rw-r--r--fs/reiserfs/bitmap.c3
-rw-r--r--fs/reiserfs/journal.c64
-rw-r--r--fs/reiserfs/super.c54
-rw-r--r--fs/squashfs/cache.c30
-rw-r--r--fs/squashfs/inode.c4
-rw-r--r--fs/squashfs/squashfs_fs_sb.h1
-rw-r--r--fs/squashfs/super.c2
-rw-r--r--fs/ubifs/debug.c90
-rw-r--r--fs/ubifs/debug.h44
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/ubifs/lpt.c6
-rw-r--r--fs/ubifs/replay.c8
-rw-r--r--fs/ubifs/tnc.c58
-rw-r--r--fs/ubifs/tnc_misc.c10
-rw-r--r--fs/ubifs/xattr.c6
121 files changed, 5100 insertions, 1669 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 945aa5f02f9b..a9ea73d6dcf3 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -62,8 +62,8 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
uint16_t klen = 0;
v9ses = (struct v9fs_session_info *)cookie_netfs_data;
- P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
- buffer, bufmax);
+ p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n",
+ v9ses, buffer, bufmax);
if (v9ses->cachetag)
klen = strlen(v9ses->cachetag);
@@ -72,7 +72,7 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
return 0;
memcpy(buffer, v9ses->cachetag, klen);
- P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+ p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag);
return klen;
}
@@ -91,14 +91,14 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
&v9fs_cache_session_index_def,
v9ses);
- P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
- v9ses->fscache);
+ p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
+ v9ses, v9ses->fscache);
}
void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
{
- P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
- v9ses->fscache);
+ p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
+ v9ses, v9ses->fscache);
fscache_relinquish_cookie(v9ses->fscache, 0);
v9ses->fscache = NULL;
}
@@ -109,8 +109,8 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
{
const struct v9fs_inode *v9inode = cookie_netfs_data;
memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path));
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
- v9inode->qid.path);
+ p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n",
+ &v9inode->vfs_inode, v9inode->qid.path);
return sizeof(v9inode->qid.path);
}
@@ -120,8 +120,8 @@ static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
const struct v9fs_inode *v9inode = cookie_netfs_data;
*size = i_size_read(&v9inode->vfs_inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
- *size);
+ p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n",
+ &v9inode->vfs_inode, *size);
}
static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
@@ -129,8 +129,8 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
{
const struct v9fs_inode *v9inode = cookie_netfs_data;
memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version));
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
- v9inode->qid.version);
+ p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n",
+ &v9inode->vfs_inode, v9inode->qid.version);
return sizeof(v9inode->qid.version);
}
@@ -206,8 +206,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
&v9fs_cache_inode_index_def,
v9inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
- v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
+ inode, v9inode->fscache);
}
void v9fs_cache_inode_put_cookie(struct inode *inode)
@@ -216,8 +216,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode)
if (!v9inode->fscache)
return;
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
- v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
+ inode, v9inode->fscache);
fscache_relinquish_cookie(v9inode->fscache, 0);
v9inode->fscache = NULL;
@@ -229,8 +229,8 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
if (!v9inode->fscache)
return;
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
- v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
+ inode, v9inode->fscache);
fscache_relinquish_cookie(v9inode->fscache, 1);
v9inode->fscache = NULL;
@@ -272,8 +272,8 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
v9inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
- inode, old, v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
+ inode, old, v9inode->fscache);
spin_unlock(&v9inode->fscache_lock);
}
@@ -323,7 +323,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
int ret;
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
if (!v9inode->fscache)
return -ENOBUFS;
@@ -335,13 +335,13 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
switch (ret) {
case -ENOBUFS:
case -ENODATA:
- P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+ p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret);
return 1;
case 0:
- P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
return ret;
default:
- P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
return ret;
}
}
@@ -361,7 +361,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
int ret;
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+ p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages);
if (!v9inode->fscache)
return -ENOBUFS;
@@ -373,15 +373,15 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
switch (ret) {
case -ENOBUFS:
case -ENODATA:
- P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+ p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret);
return 1;
case 0:
BUG_ON(!list_empty(pages));
BUG_ON(*nr_pages != 0);
- P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
return ret;
default:
- P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
return ret;
}
}
@@ -396,9 +396,9 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
int ret;
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
- P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
+ p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
if (ret != 0)
v9fs_uncache_page(inode, page);
}
@@ -409,7 +409,7 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
{
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
if (PageFsCache(page))
fscache_wait_on_page_write(v9inode->fscache, page);
}
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 85b67ffa2a43..da8eefbe830d 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -45,8 +45,8 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
{
struct v9fs_dentry *dent;
- P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n",
- fid->fid, dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n",
+ fid->fid, dentry->d_name.name);
dent = dentry->d_fsdata;
if (!dent) {
@@ -79,8 +79,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
struct v9fs_dentry *dent;
struct p9_fid *fid, *ret;
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
- dentry->d_name.name, dentry, uid, any);
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
+ dentry->d_name.name, dentry, uid, any);
dent = (struct v9fs_dentry *) dentry->d_fsdata;
ret = NULL;
if (dent) {
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2b78014a124a..1964f98e74be 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -23,6 +23,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -85,15 +87,15 @@ static int get_cache_mode(char *s)
if (!strcmp(s, "loose")) {
version = CACHE_LOOSE;
- P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n");
+ p9_debug(P9_DEBUG_9P, "Cache mode: loose\n");
} else if (!strcmp(s, "fscache")) {
version = CACHE_FSCACHE;
- P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n");
+ p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
} else if (!strcmp(s, "none")) {
version = CACHE_NONE;
- P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n");
+ p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
} else
- printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s);
+ pr_info("Unknown Cache mode %s\n", s);
return version;
}
@@ -140,8 +142,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_debug:
r = match_int(&args[0], &option);
if (r < 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
ret = r;
continue;
}
@@ -154,8 +156,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_dfltuid:
r = match_int(&args[0], &option);
if (r < 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
ret = r;
continue;
}
@@ -164,8 +166,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_dfltgid:
r = match_int(&args[0], &option);
if (r < 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
ret = r;
continue;
}
@@ -174,8 +176,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_afid:
r = match_int(&args[0], &option);
if (r < 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
ret = r;
continue;
}
@@ -205,8 +207,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
s = match_strdup(&args[0]);
if (!s) {
ret = -ENOMEM;
- P9_DPRINTK(P9_DEBUG_ERROR,
- "problem allocating copy of cache arg\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "problem allocating copy of cache arg\n");
goto free_and_return;
}
ret = get_cache_mode(s);
@@ -223,8 +225,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
s = match_strdup(&args[0]);
if (!s) {
ret = -ENOMEM;
- P9_DPRINTK(P9_DEBUG_ERROR,
- "problem allocating copy of access arg\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "problem allocating copy of access arg\n");
goto free_and_return;
}
@@ -240,8 +242,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
v9ses->uid = simple_strtoul(s, &e, 10);
if (*e != '\0') {
ret = -EINVAL;
- printk(KERN_INFO "9p: Unknown access "
- "argument %s.\n", s);
+ pr_info("Unknown access argument %s\n",
+ s);
kfree(s);
goto free_and_return;
}
@@ -254,9 +256,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#ifdef CONFIG_9P_FS_POSIX_ACL
v9ses->flags |= V9FS_POSIX_ACL;
#else
- P9_DPRINTK(P9_DEBUG_ERROR,
- "Not defined CONFIG_9P_FS_POSIX_ACL. "
- "Ignoring posixacl option\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
#endif
break;
@@ -318,7 +319,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (IS_ERR(v9ses->clnt)) {
retval = PTR_ERR(v9ses->clnt);
v9ses->clnt = NULL;
- P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n");
+ p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
goto error;
}
@@ -371,7 +372,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (IS_ERR(fid)) {
retval = PTR_ERR(fid);
fid = NULL;
- P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n");
+ p9_debug(P9_DEBUG_ERROR, "cannot attach\n");
goto error;
}
@@ -429,7 +430,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
*/
void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
- P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
+ p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
p9_client_disconnect(v9ses->clnt);
}
@@ -442,7 +443,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
{
- P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+ p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
p9_client_begin_disconnect(v9ses->clnt);
}
@@ -591,23 +592,23 @@ static void v9fs_cache_unregister(void)
static int __init init_v9fs(void)
{
int err;
- printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
+ pr_info("Installing v9fs 9p2000 file system support\n");
/* TODO: Setup list of registered trasnport modules */
err = register_filesystem(&v9fs_fs_type);
if (err < 0) {
- printk(KERN_ERR "Failed to register filesystem\n");
+ pr_err("Failed to register filesystem\n");
return err;
}
err = v9fs_cache_register();
if (err < 0) {
- printk(KERN_ERR "Failed to register v9fs for caching\n");
+ pr_err("Failed to register v9fs for caching\n");
goto out_fs_unreg;
}
err = v9fs_sysfs_init();
if (err < 0) {
- printk(KERN_ERR "Failed to register with sysfs\n");
+ pr_err("Failed to register with sysfs\n");
goto out_sysfs_cleanup;
}
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 2524e4cbb8ea..0ad61c6a65a5 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -56,7 +56,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
struct inode *inode;
inode = page->mapping->host;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
BUG_ON(!PageLocked(page));
@@ -116,14 +116,14 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
struct inode *inode;
inode = mapping->host;
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+ p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
if (ret == 0)
return ret;
ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
- P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
+ p9_debug(P9_DEBUG_VFS, " = %d\n", ret);
return ret;
}
@@ -263,10 +263,9 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* Now that we do caching with cache mode enabled, We need
* to support direct IO
*/
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
- "off/no(%lld/%lu) EINVAL\n",
- iocb->ki_filp->f_path.dentry->d_name.name,
- (long long) pos, nr_segs);
+ p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
+ iocb->ki_filp->f_path.dentry->d_name.name,
+ (long long)pos, nr_segs);
return -EINVAL;
}
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index e022890c6f40..d529437ff442 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -53,8 +53,8 @@
static int v9fs_dentry_delete(const struct dentry *dentry)
{
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
- dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+ dentry->d_name.name, dentry);
return 1;
}
@@ -66,8 +66,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
*/
static int v9fs_cached_dentry_delete(const struct dentry *dentry)
{
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
- dentry->d_name.name, dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+ dentry->d_name.name, dentry);
/* Don't cache negative dentries */
if (!dentry->d_inode)
@@ -86,8 +86,8 @@ static void v9fs_dentry_release(struct dentry *dentry)
struct v9fs_dentry *dent;
struct p9_fid *temp, *current_fid;
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
- dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+ dentry->d_name.name, dentry);
dent = dentry->d_fsdata;
if (dent) {
list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 598fff1a54e5..ff911e779651 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -140,7 +140,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
int reclen = 0;
struct p9_rdir *rdir;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
fid = filp->private_data;
buflen = fid->clnt->msize - P9_IOHDRSZ;
@@ -168,7 +168,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
rdir->tail - rdir->head, &st);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
p9stat_free(&st);
goto unlock_and_exit;
@@ -213,7 +213,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
struct p9_dirent curdirent;
u64 oldoffset = 0;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
fid = filp->private_data;
buflen = fid->clnt->msize - P9_READDIRHDRSZ;
@@ -244,7 +244,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
rdir->tail - rdir->head,
&curdirent);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
goto unlock_and_exit;
}
@@ -290,9 +290,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
struct p9_fid *fid;
fid = filp->private_data;
- P9_DPRINTK(P9_DEBUG_VFS,
- "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
- inode, filp, fid ? fid->fid : -1);
+ p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
+ inode, filp, fid ? fid->fid : -1);
if (fid)
p9_client_clunk(fid);
return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 62857a810a79..fc06fd27065e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
struct p9_fid *fid;
int omode;
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
+ p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
v9inode = V9FS_I(inode);
v9ses = v9fs_inode2v9ses(inode);
if (v9fs_proto_dotl(v9ses))
@@ -135,7 +135,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
int res = 0;
struct inode *inode = filp->f_path.dentry->d_inode;
- P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
+ p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -204,7 +204,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
break;
- schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
+ if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+ break;
}
/* map 9p status to VFS status */
@@ -304,8 +305,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
struct inode *inode = filp->f_path.dentry->d_inode;
int ret = -ENOLCK;
- P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
- cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+ filp, cmd, fl, filp->f_path.dentry->d_name.name);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -340,8 +341,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
struct inode *inode = filp->f_path.dentry->d_inode;
int ret = -ENOLCK;
- P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
- cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+ filp, cmd, fl, filp->f_path.dentry->d_name.name);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -384,8 +385,8 @@ v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
{
int n, total, size;
- P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
- (long long unsigned) offset, count);
+ p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n",
+ fid->fid, (long long unsigned)offset, count);
n = 0;
total = 0;
size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -443,7 +444,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
struct p9_fid *fid;
size_t size;
- P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
+ p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
fid = filp->private_data;
size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -470,8 +471,8 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
loff_t origin = *offset;
unsigned long pg_start, pg_end;
- P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
- (int)count, (int)*offset);
+ p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n",
+ data, (int)count, (int)*offset);
clnt = fid->clnt;
do {
@@ -552,7 +553,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
return retval;
mutex_lock(&inode->i_mutex);
- P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
+ p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
v9fs_blank_wstat(&wstat);
@@ -575,8 +576,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
return retval;
mutex_lock(&inode->i_mutex);
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
- filp, datasync);
+ p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
@@ -607,8 +607,8 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = filp->f_path.dentry->d_inode;
- P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
- page, (unsigned long)filp->private_data);
+ p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
+ page, (unsigned long)filp->private_data);
v9inode = V9FS_I(inode);
/* make sure the cache has finished storing the page */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e0f20de6aa2b..014c8dd62962 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -23,6 +23,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -88,6 +90,32 @@ static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode)
}
/**
+ * p9mode2perm- convert plan9 mode bits to unix permission bits
+ * @v9ses: v9fs session information
+ * @stat: p9_wstat from which mode need to be derived
+ *
+ */
+static int p9mode2perm(struct v9fs_session_info *v9ses,
+ struct p9_wstat *stat)
+{
+ int res;
+ int mode = stat->mode;
+
+ res = mode & S_IALLUGO;
+ if (v9fs_proto_dotu(v9ses)) {
+ if ((mode & P9_DMSETUID) == P9_DMSETUID)
+ res |= S_ISUID;
+
+ if ((mode & P9_DMSETGID) == P9_DMSETGID)
+ res |= S_ISGID;
+
+ if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
+ res |= S_ISVTX;
+ }
+ return res;
+}
+
+/**
* p9mode2unixmode- convert plan9 mode bits to unix mode bits
* @v9ses: v9fs session information
* @stat: p9_wstat from which mode need to be derived
@@ -100,8 +128,8 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
int res;
u32 mode = stat->mode;
- res = mode & S_IALLUGO;
*rdev = 0;
+ res = p9mode2perm(v9ses, stat);
if ((mode & P9_DMDIR) == P9_DMDIR)
res |= S_IFDIR;
@@ -128,24 +156,13 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
res |= S_IFBLK;
break;
default:
- P9_DPRINTK(P9_DEBUG_ERROR,
- "Unknown special type %c %s\n", type,
- stat->extension);
+ p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n",
+ type, stat->extension);
};
*rdev = MKDEV(major, minor);
} else
res |= S_IFREG;
- if (v9fs_proto_dotu(v9ses)) {
- if ((mode & P9_DMSETUID) == P9_DMSETUID)
- res |= S_ISUID;
-
- if ((mode & P9_DMSETGID) == P9_DMSETGID)
- res |= S_ISGID;
-
- if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
- res |= S_ISVTX;
- }
return res;
}
@@ -275,8 +292,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
} else if (v9fs_proto_dotu(v9ses)) {
inode->i_op = &v9fs_file_inode_operations;
} else {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "special files without extended mode\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "special files without extended mode\n");
err = -EINVAL;
goto error;
}
@@ -301,8 +318,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
break;
case S_IFLNK:
if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
- "legacy protocol.\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "extended modes used with legacy protocol\n");
err = -EINVAL;
goto error;
}
@@ -329,8 +346,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
break;
default:
- P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n",
- mode, mode & S_IFMT);
+ p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n",
+ mode, mode & S_IFMT);
err = -EINVAL;
goto error;
}
@@ -352,11 +369,12 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
+ p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
inode = new_inode(sb);
if (!inode) {
- P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
+ pr_warn("%s (%d): Problem allocating inode\n",
+ __func__, task_pid_nr(current));
return ERR_PTR(-ENOMEM);
}
err = v9fs_init_inode(v9ses, inode, mode, rdev);
@@ -573,15 +591,15 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
struct p9_fid *v9fid, *dfid;
struct v9fs_session_info *v9ses;
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
- dir, dentry, flags);
+ p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
+ dir, dentry, flags);
v9ses = v9fs_inode2v9ses(dir);
inode = dentry->d_inode;
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
retval = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
return retval;
}
if (v9fs_proto_dotl(v9ses))
@@ -630,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
struct p9_fid *dfid, *ofid, *fid;
struct inode *inode;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
err = 0;
ofid = NULL;
@@ -639,7 +657,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
return ERR_PTR(err);
}
@@ -647,36 +665,41 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
ofid = p9_client_walk(dfid, 0, NULL, 1);
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
return ERR_PTR(err);
}
err = p9_client_fcreate(ofid, name, perm, mode, extension);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
- goto error;
- }
-
- /* now walk from the parent so we can get unopened fid */
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- fid = NULL;
+ p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
goto error;
}
- /* instantiate inode and assign the unopened fid to the dentry */
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
- goto error;
+ if (!(perm & P9_DMLINK)) {
+ /* now walk from the parent so we can get unopened fid */
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ p9_debug(P9_DEBUG_VFS,
+ "p9_client_walk failed %d\n", err);
+ fid = NULL;
+ goto error;
+ }
+ /*
+ * instantiate inode and assign the unopened fid to the dentry
+ */
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ p9_debug(P9_DEBUG_VFS,
+ "inode creation failed %d\n", err);
+ goto error;
+ }
+ err = v9fs_fid_add(dentry, fid);
+ if (err < 0)
+ goto error;
+ d_instantiate(dentry, inode);
}
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
- d_instantiate(dentry, inode);
return ofid;
error:
if (ofid)
@@ -788,7 +811,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
struct p9_fid *fid;
struct v9fs_session_info *v9ses;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
@@ -826,8 +849,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
char *name;
int result = 0;
- P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
- dir, dentry->d_name.name, dentry, nameidata);
+ p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
+ dir, dentry->d_name.name, dentry, nameidata);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -933,7 +956,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct p9_fid *newdirfid;
struct p9_wstat wstat;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
retval = 0;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
@@ -969,8 +992,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* 9P .u can only handle file rename in the same directory
*/
- P9_DPRINTK(P9_DEBUG_ERROR,
- "old dir and new dir are different\n");
+ p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n");
retval = -EXDEV;
goto clunk_newdir;
}
@@ -1026,7 +1048,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct p9_fid *fid;
struct p9_wstat *st;
- P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+ p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
err = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
@@ -1063,7 +1085,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
struct p9_fid *fid;
struct p9_wstat wstat;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
retval = inode_change_ok(dentry->d_inode, iattr);
if (retval)
return retval;
@@ -1162,7 +1184,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
set_nlink(inode, i_nlink);
}
}
- mode = stat->mode & S_IALLUGO;
+ mode = p9mode2perm(v9ses, stat);
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
i_size_write(inode, stat->length);
@@ -1208,7 +1230,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
struct p9_fid *fid;
struct p9_wstat *st;
- P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
retval = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
fid = v9fs_fid_lookup(dentry);
@@ -1230,8 +1252,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
/* copy extension buffer into buffer */
strncpy(buffer, st->extension, buflen);
- P9_DPRINTK(P9_DEBUG_VFS,
- "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
+ p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n",
+ dentry->d_name.name, st->extension, buffer);
retval = strnlen(buffer, buflen);
done:
@@ -1252,7 +1274,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
int len = 0;
char *link = __getname();
- P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
if (!link)
link = ERR_PTR(-ENOMEM);
@@ -1283,8 +1305,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
{
char *s = nd_get_link(nd);
- P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name,
- IS_ERR(s) ? "<error>" : s);
+ p9_debug(P9_DEBUG_VFS, " %s %s\n",
+ dentry->d_name.name, IS_ERR(s) ? "<error>" : s);
if (!IS_ERR(s))
__putname(s);
}
@@ -1306,7 +1328,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
v9ses = v9fs_inode2v9ses(dir);
if (!v9fs_proto_dotu(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
+ p9_debug(P9_DEBUG_ERROR, "not extended\n");
return -EPERM;
}
@@ -1333,8 +1355,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
static int
v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
- P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino,
- dentry->d_name.name, symname);
+ p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+ dir->i_ino, dentry->d_name.name, symname);
return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
}
@@ -1355,9 +1377,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
char *name;
struct p9_fid *oldfid;
- P9_DPRINTK(P9_DEBUG_VFS,
- " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
- old_dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+ dir->i_ino, dentry->d_name.name, old_dentry->d_name.name);
oldfid = v9fs_fid_clone(old_dentry);
if (IS_ERR(oldfid))
@@ -1398,9 +1419,9 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
char *name;
u32 perm;
- P9_DPRINTK(P9_DEBUG_VFS,
- " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino,
- dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+ p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
+ dir->i_ino, dentry->d_name.name, mode,
+ MAJOR(rdev), MINOR(rdev));
if (!new_valid_dev(rdev))
return -EINVAL;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 8ef152ac6a16..a1e6c990cd41 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -283,13 +283,13 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
}
name = (char *) dentry->d_name.name;
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
- "mode:0x%hx\n", name, flags, omode);
+ p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
+ name, flags, omode);
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
return err;
}
@@ -297,7 +297,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
ofid = p9_client_walk(dfid, 0, NULL, 1);
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
return err;
}
@@ -307,16 +307,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "Failed to get acl values in creat %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
+ err);
goto error;
}
err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
mode, gid, &qid);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "p9_client_open_dotl failed in creat %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
+ err);
goto error;
}
v9fs_invalidate_inode_attr(dir);
@@ -325,14 +324,14 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
fid = NULL;
goto error;
}
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
goto error;
}
err = v9fs_fid_add(dentry, fid);
@@ -408,7 +407,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
@@ -420,7 +419,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
dfid = NULL;
goto error;
}
@@ -430,8 +429,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "Failed to get acl values in mkdir %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n",
+ err);
goto error;
}
name = (char *) dentry->d_name.name;
@@ -444,8 +443,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
fid = NULL;
goto error;
}
@@ -453,8 +452,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
goto error;
}
err = v9fs_fid_add(dentry, fid);
@@ -495,7 +494,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
struct p9_fid *fid;
struct p9_stat_dotl *st;
- P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+ p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
err = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
@@ -523,6 +522,46 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
return 0;
}
+/*
+ * Attribute flags.
+ *
+ * Bits of the mask that v9fs_mapped_iattr_valid() builds from the VFS
+ * ATTR_* bits; v9fs_vfs_setattr_dotl() stores that mask in p9attr.valid.
+ * NOTE(review): presumably these have to match the "valid" bit layout the
+ * 9P2000.L server expects for setattr -- confirm against the protocol
+ * definition before renumbering anything here.
+ */
+#define P9_ATTR_MODE (1 << 0)
+#define P9_ATTR_UID (1 << 1)
+#define P9_ATTR_GID (1 << 2)
+#define P9_ATTR_SIZE (1 << 3)
+#define P9_ATTR_ATIME (1 << 4)
+#define P9_ATTR_MTIME (1 << 5)
+#define P9_ATTR_CTIME (1 << 6)
+#define P9_ATTR_ATIME_SET (1 << 7)
+#define P9_ATTR_MTIME_SET (1 << 8)
+
+struct dotl_iattr_map {
+ int iattr_valid; /* VFS-side ATTR_* bit to look for */
+ int p9_iattr_valid; /* P9_ATTR_* bit it is translated to */
+};
+
+/*
+ * v9fs_mapped_iattr_valid - translate a VFS ia_valid mask into P9_ATTR_* bits
+ * @iattr_valid: mask of ATTR_* bits (the setattr path passes iattr->ia_valid)
+ *
+ * Returns the OR of the P9_ATTR_* bit for every ATTR_* bit listed in the
+ * table below that is set in @iattr_valid.  ATTR_* bits that have no table
+ * entry are silently dropped from the result.
+ */
+static int v9fs_mapped_iattr_valid(int iattr_valid)
+{
+ unsigned int i;
+ int p9_iattr_valid = 0;
+ /*
+ * The table is read-only, so make it static const: it is then built
+ * once at compile time instead of being re-initialised on the stack
+ * on every call.
+ */
+ static const struct dotl_iattr_map dotl_iattr_map[] = {
+ { ATTR_MODE, P9_ATTR_MODE },
+ { ATTR_UID, P9_ATTR_UID },
+ { ATTR_GID, P9_ATTR_GID },
+ { ATTR_SIZE, P9_ATTR_SIZE },
+ { ATTR_ATIME, P9_ATTR_ATIME },
+ { ATTR_MTIME, P9_ATTR_MTIME },
+ { ATTR_CTIME, P9_ATTR_CTIME },
+ { ATTR_ATIME_SET, P9_ATTR_ATIME_SET },
+ { ATTR_MTIME_SET, P9_ATTR_MTIME_SET },
+ };
+ for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) {
+ if (iattr_valid & dotl_iattr_map[i].iattr_valid)
+ p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid;
+ }
+ return p9_iattr_valid;
+}
+
/**
* v9fs_vfs_setattr_dotl - set file metadata
* @dentry: file whose metadata to set
@@ -537,13 +576,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
struct p9_fid *fid;
struct p9_iattr_dotl p9attr;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
retval = inode_change_ok(dentry->d_inode, iattr);
if (retval)
return retval;
- p9attr.valid = iattr->ia_valid;
+ p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
p9attr.mode = iattr->ia_mode;
p9attr.uid = iattr->ia_uid;
p9attr.gid = iattr->ia_gid;
@@ -670,14 +709,13 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
struct v9fs_session_info *v9ses;
name = (char *) dentry->d_name.name;
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
- dir->i_ino, name, symname);
+ p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
v9ses = v9fs_inode2v9ses(dir);
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
return err;
}
@@ -687,7 +725,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
goto error;
}
@@ -697,8 +735,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
fid = NULL;
goto error;
}
@@ -707,8 +745,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
goto error;
}
err = v9fs_fid_add(dentry, fid);
@@ -751,9 +789,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
struct p9_fid *dfid, *oldfid;
struct v9fs_session_info *v9ses;
- P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
- dir->i_ino, old_dentry->d_name.name,
- dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
+ dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
v9ses = v9fs_inode2v9ses(dir);
dir_dentry = v9fs_dentry_from_dir_inode(dir);
@@ -770,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
return err;
}
@@ -813,9 +850,9 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
- P9_DPRINTK(P9_DEBUG_VFS,
- " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino,
- dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
+ p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
+ dir->i_ino, dentry->d_name.name, omode,
+ MAJOR(rdev), MINOR(rdev));
if (!new_valid_dev(rdev))
return -EINVAL;
@@ -825,7 +862,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
dfid = NULL;
goto error;
}
@@ -835,8 +872,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "Failed to get acl values in mknod %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n",
+ err);
goto error;
}
name = (char *) dentry->d_name.name;
@@ -851,8 +888,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
fid = NULL;
goto error;
}
@@ -860,8 +897,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
goto error;
}
err = v9fs_fid_add(dentry, fid);
@@ -905,7 +942,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
char *link = __getname();
char *target;
- P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
if (!link) {
link = ERR_PTR(-ENOMEM);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f68ff65a32a5..7b0cd87b07c2 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -121,7 +121,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
struct p9_fid *fid;
int retval = 0;
- P9_DPRINTK(P9_DEBUG_VFS, " \n");
+ p9_debug(P9_DEBUG_VFS, "\n");
v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
if (!v9ses)
@@ -191,7 +191,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
v9fs_fid_add(root, fid);
- P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
+ p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
return dget(sb->s_root);
clunk_fid:
@@ -223,7 +223,7 @@ static void v9fs_kill_super(struct super_block *s)
{
struct v9fs_session_info *v9ses = s->s_fs_info;
- P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
+ p9_debug(P9_DEBUG_VFS, " %p\n", s);
kill_anon_super(s);
@@ -231,7 +231,7 @@ static void v9fs_kill_super(struct super_block *s)
v9fs_session_close(v9ses);
kfree(v9ses);
s->s_fs_info = NULL;
- P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
+ p9_debug(P9_DEBUG_VFS, "exiting kill_super\n");
}
static void
@@ -303,7 +303,7 @@ static int v9fs_write_inode(struct inode *inode,
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
- P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
v9inode = V9FS_I(inode);
if (!v9inode->writeback_fid)
return 0;
@@ -326,7 +326,7 @@ static int v9fs_write_inode_dotl(struct inode *inode,
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
- P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
v9inode = V9FS_I(inode);
if (!v9inode->writeback_fid)
return 0;
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index d288773871b3..29653b70a9c3 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -32,8 +32,8 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
if (IS_ERR(attr_fid)) {
retval = PTR_ERR(attr_fid);
- P9_DPRINTK(P9_DEBUG_VFS,
- "p9_client_attrwalk failed %zd\n", retval);
+ p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n",
+ retval);
attr_fid = NULL;
goto error;
}
@@ -87,8 +87,8 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
{
struct p9_fid *fid;
- P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
- __func__, name, buffer_size);
+ p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
+ name, buffer_size);
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -115,8 +115,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
int retval, msize, write_count;
struct p9_fid *fid = NULL;
- P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n",
- __func__, name, value_len, flags);
+ p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
+ name, value_len, flags);
fid = v9fs_fid_clone(dentry);
if (IS_ERR(fid)) {
@@ -129,8 +129,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
*/
retval = p9_client_xattrcreate(fid, name, value_len, flags);
if (retval < 0) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "p9_client_xattrcreate failed %d\n", retval);
+ p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
+ retval);
goto error;
}
msize = fid->clnt->msize;
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 79e2ca7973b7..e95d1b64082c 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF
bool
depends on COMPAT && BINFMT_ELF
+config ARCH_BINFMT_ELF_RANDOMIZE_PIE
+ bool
+
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y
diff --git a/fs/aio.c b/fs/aio.c
index 78c514cfd212..969beb0e2231 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -476,14 +476,21 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total)
batch->count = total;
}
-static void kiocb_batch_free(struct kiocb_batch *batch)
+static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
{
struct kiocb *req, *n;
+ if (list_empty(&batch->head)) /* nothing left in the batch: return without taking ctx_lock */
+ return;
+
+ spin_lock_irq(&ctx->ctx_lock); /* held across the ki_list unlink and reqs_active update below */
list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
list_del(&req->ki_batch);
+ list_del(&req->ki_list); /* also unlink from ki_list -- presumably the ctx's list of active requests; confirm at the allocation site */
kmem_cache_free(kiocb_cachep, req);
+ ctx->reqs_active--; /* each freed request is removed from ctx's active count */
}
+ spin_unlock_irq(&ctx->ctx_lock);
}
/*
@@ -1742,7 +1749,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
}
blk_finish_plug(&plug);
- kiocb_batch_free(&batch);
+ kiocb_batch_free(ctx, &batch);
put_ioctx(ctx);
return i ? i : ret;
}
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 5869d4e974a9..d8d8e7ba6a1e 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -116,6 +116,7 @@ struct autofs_sb_info {
int needs_reghost;
struct super_block *sb;
struct mutex wq_mutex;
+ struct mutex pipe_mutex;
spinlock_t fs_lock;
struct autofs_wait_queue *queues; /* Wait queue pointer */
spinlock_t lookup_lock;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 2ba44c79d548..e16980b00b8d 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -225,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->min_proto = 0;
sbi->max_proto = 0;
mutex_init(&sbi->wq_mutex);
+ mutex_init(&sbi->pipe_mutex);
spin_lock_init(&sbi->fs_lock);
sbi->queues = NULL;
spin_lock_init(&sbi->lookup_lock);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e1fbdeef85db..da8876d38a7b 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -56,26 +56,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
mutex_unlock(&sbi->wq_mutex);
}
-static int autofs4_write(struct file *file, const void *addr, int bytes)
+static int autofs4_write(struct autofs_sb_info *sbi,
+ struct file *file, const void *addr, int bytes)
{
unsigned long sigpipe, flags;
mm_segment_t fs;
const char *data = (const char *)addr;
ssize_t wr = 0;
- /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-
sigpipe = sigismember(&current->pending.signal, SIGPIPE);
/* Save pointer to user space and point back to kernel space */
fs = get_fs();
set_fs(KERNEL_DS);
+ mutex_lock(&sbi->pipe_mutex);
while (bytes &&
(wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
data += wr;
bytes -= wr;
}
+ mutex_unlock(&sbi->pipe_mutex);
set_fs(fs);
@@ -110,6 +111,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = type;
+ mutex_lock(&sbi->wq_mutex);
+
+ /* Check if we have become catatonic */
+ if (sbi->catatonic) {
+ mutex_unlock(&sbi->wq_mutex);
+ return;
+ }
switch (type) {
/* Kernel protocol v4 missing and expire packets */
case autofs_ptype_missing:
@@ -163,22 +171,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
}
default:
printk("autofs4_notify_daemon: bad type %d!\n", type);
+ mutex_unlock(&sbi->wq_mutex);
return;
}
- /* Check if we have become catatonic */
- mutex_lock(&sbi->wq_mutex);
- if (!sbi->catatonic) {
- pipe = sbi->pipe;
- get_file(pipe);
- }
+ pipe = sbi->pipe;
+ get_file(pipe);
+
mutex_unlock(&sbi->wq_mutex);
- if (pipe) {
- if (autofs4_write(pipe, &pkt, pktsz))
- autofs4_catatonic_mode(sbi);
- fput(pipe);
- }
+ if (autofs4_write(sbi, pipe, &pkt, pktsz))
+ autofs4_catatonic_mode(sbi);
+ fput(pipe);
}
static int autofs4_getpath(struct autofs_sb_info *sbi,
@@ -257,6 +261,9 @@ static int validate_request(struct autofs_wait_queue **wait,
struct autofs_wait_queue *wq;
struct autofs_info *ino;
+ if (sbi->catatonic)
+ return -ENOENT;
+
/* Wait in progress, continue; */
wq = autofs4_find_wait(sbi, qstr);
if (wq) {
@@ -289,6 +296,9 @@ static int validate_request(struct autofs_wait_queue **wait,
if (mutex_lock_interruptible(&sbi->wq_mutex))
return -EINTR;
+ if (sbi->catatonic)
+ return -ENOENT;
+
wq = autofs4_find_wait(sbi, qstr);
if (wq) {
*wait = wq;
@@ -389,7 +399,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
ret = validate_request(&wq, sbi, &qstr, dentry, notify);
if (ret <= 0) {
- if (ret == 0)
+ if (ret != -EINTR)
mutex_unlock(&sbi->wq_mutex);
kfree(qstr.name);
return ret;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 21ac5ee4b43f..bcb884e2d613 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
* default mmap base, as well as whatever program they
* might try to exec. This is because the brk will
* follow the loader, and is not movable. */
-#if defined(CONFIG_X86) || defined(CONFIG_ARM)
+#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
/* Memory randomization might have been switched off
* in runtime via sysctl.
* If that is the case, retain the original non-zero
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 69a5b6fbee2b..0e575d1304b4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,7 +25,6 @@
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
-#include <linux/kmemleak.h>
#include <linux/cleancache.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -521,7 +520,7 @@ static struct super_block *blockdev_superblock __read_mostly;
void __init bdev_cache_init(void)
{
int err;
- struct vfsmount *bd_mnt;
+ static struct vfsmount *bd_mnt;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -533,12 +532,7 @@ void __init bdev_cache_init(void)
bd_mnt = kern_mount(&bd_type);
if (IS_ERR(bd_mnt))
panic("Cannot create bdev pseudo-fs");
- /*
- * This vfsmount structure is only used to obtain the
- * blockdev_superblock, so tell kmemleak not to report it.
- */
- kmemleak_not_leak(bd_mnt);
- blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
+ blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
}
/*
@@ -1145,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
bdev->bd_disk = disk;
+ bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
if (!partno) {
struct backing_dev_info *bdi;
@@ -1165,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
+ bdev->bd_queue = NULL;
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
put_disk(disk);
@@ -1238,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
+ bdev->bd_queue = NULL;
bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f99a099a7747..d8525662ca7a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
@@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping,
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97fbe939c050..034d98503229 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
again:
for (i = 0; i < num_pages; i++) {
pages[i] = find_or_create_page(inode->i_mapping, index + i,
- mask);
+ mask | __GFP_WRITE);
if (!pages[i]) {
faili = i - 1;
err = -ENOMEM;
@@ -1136,7 +1136,8 @@ again:
GFP_NOFS);
}
for (i = 0; i < num_pages; i++) {
- clear_page_dirty_for_io(pages[i]);
+ if (clear_page_dirty_for_io(pages[i]))
+ account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 74fd74719dc2..618246bc2196 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -973,7 +973,7 @@ static int dentry_lease_is_valid(struct dentry *dentry)
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di && di->lease_session) {
+ if (di->lease_session) {
s = di->lease_session;
spin_lock(&s->s_cap_lock);
gen = s->s_cap_gen;
@@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
dout("d_release %p\n", dentry);
- if (di) {
- ceph_dentry_lru_del(dentry);
- if (di->lease_session)
- ceph_put_mds_session(di->lease_session);
- kmem_cache_free(ceph_dentry_cachep, di);
- dentry->d_fsdata = NULL;
- }
+ ceph_dentry_lru_del(dentry);
+ if (di->lease_session)
+ ceph_put_mds_session(di->lease_session);
+ kmem_cache_free(ceph_dentry_cachep, di);
+ dentry->d_fsdata = NULL;
}
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
@@ -1096,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
*/
void ceph_dir_set_complete(struct inode *inode)
{
- /* not yet implemented */
+ struct dentry *dentry = d_find_any_alias(inode); /* result is NULL-checked before use below */
+
+ if (dentry && ceph_dentry(dentry) &&
+ ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { /* flag is only set when the DCACHE mount option is on */
+ dout(" marking %p (%p) complete\n", inode, dentry);
+ set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+ dput(dentry); /* called unconditionally, including when dentry is NULL */
}
void ceph_dir_clear_complete(struct inode *inode)
{
- /* not yet implemented */
+ /* Drop CEPH_D_COMPLETE from an alias dentry of @inode, if it has one. */
+ struct dentry *dentry = d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry)) {
+ dout(" marking %p (%p) NOT complete\n", inode, dentry);
+ clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+ dput(dentry);
}
bool ceph_dir_test_complete(struct inode *inode)
{
- /* not yet implemented */
+ /*
+ * Report whether CEPH_D_COMPLETE is set on an alias dentry of @inode.
+ * Pure query: the flag is not modified.  Returns false when the inode
+ * has no alias dentry or the dentry has no ceph_dentry() data.
+ */
+ struct dentry *dentry = d_find_any_alias(inode);
+ bool complete = false;
+
+ if (dentry && ceph_dentry(dentry))
+ complete = test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ dput(dentry);
- return false;
+ return complete;
}
@@ -1220,6 +1237,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
do {
ceph_mdsc_get_request(req);
spin_unlock(&ci->i_unsafe_lock);
+
dout("dir_fsync %p wait on tid %llu (until %llu)\n",
inode, req->r_tid, last_tid);
if (req->r_timeout) {
@@ -1232,9 +1250,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
} else {
wait_for_completion(&req->r_safe_completion);
}
- spin_lock(&ci->i_unsafe_lock);
ceph_mdsc_put_request(req);
+ spin_lock(&ci->i_unsafe_lock);
if (ret || list_empty(head))
break;
req = list_entry(head->next,
@@ -1259,13 +1277,11 @@ void ceph_dentry_lru_add(struct dentry *dn)
dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_add_tail(&di->lru, &mdsc->dentry_lru);
- mdsc->num_dentry++;
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_add_tail(&di->lru, &mdsc->dentry_lru);
+ mdsc->num_dentry++;
+ spin_unlock(&mdsc->dentry_lru_lock);
}
void ceph_dentry_lru_touch(struct dentry *dn)
@@ -1275,12 +1291,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
dn->d_name.len, dn->d_name.name, di->offset);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_move_tail(&di->lru, &mdsc->dentry_lru);
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_move_tail(&di->lru, &mdsc->dentry_lru);
+ spin_unlock(&mdsc->dentry_lru_lock);
}
void ceph_dentry_lru_del(struct dentry *dn)
@@ -1290,13 +1304,11 @@ void ceph_dentry_lru_del(struct dentry *dn)
dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_del_init(&di->lru);
- mdsc->num_dentry--;
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_del_init(&di->lru);
+ mdsc->num_dentry--;
+ spin_unlock(&mdsc->dentry_lru_lock);
}
/*
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9fbcdecaaccd..fbb2a643ef10 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
return -EINVAL;
spin_lock(&dentry->d_lock);
- parent = dget(dentry->d_parent);
- spin_unlock(&dentry->d_lock);
-
+ parent = dentry->d_parent;
if (*max_len >= connected_handle_length) {
dout("encode_fh %p connectable\n", dentry);
cfh->ino = ceph_ino(dentry->d_inode);
@@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
*max_len = handle_length;
type = 255;
}
- dput(parent);
+ spin_unlock(&dentry->d_lock);
return type;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 25283e7a37f8..2c489378b4cd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -850,11 +850,12 @@ static void ceph_set_dentry_offset(struct dentry *dn)
{
struct dentry *dir = dn->d_parent;
struct inode *inode = dir->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_info *ci;
struct ceph_dentry_info *di;
BUG_ON(!inode);
+ ci = ceph_inode(inode);
di = ceph_dentry(dn);
spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6203d805eb45..23ab6a3f1825 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2772,7 +2772,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
di = ceph_dentry(dentry);
switch (h->action) {
case CEPH_MDS_LEASE_REVOKE:
- if (di && di->lease_session == session) {
+ if (di->lease_session == session) {
if (ceph_seq_cmp(di->lease_seq, seq) > 0)
h->seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
@@ -2781,7 +2781,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
break;
case CEPH_MDS_LEASE_RENEW:
- if (di && di->lease_session == session &&
+ if (di->lease_session == session &&
di->lease_gen == session->s_cap_gen &&
di->lease_renew_from &&
di->lease_renew_after == 0) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 48f61a12af66..00de2c9568cd 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,8 @@ enum {
Opt_rbytes,
Opt_norbytes,
Opt_noasyncreaddir,
+ Opt_dcache,
+ Opt_nodcache,
Opt_ino32,
};
@@ -152,6 +154,8 @@ static match_table_t fsopt_tokens = {
{Opt_rbytes, "rbytes"},
{Opt_norbytes, "norbytes"},
{Opt_noasyncreaddir, "noasyncreaddir"},
+ {Opt_dcache, "dcache"},
+ {Opt_nodcache, "nodcache"},
{Opt_ino32, "ino32"},
{-1, NULL}
};
@@ -231,6 +235,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_noasyncreaddir:
fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
break;
+ case Opt_dcache:
+ fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
+ break;
+ case Opt_nodcache:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
+ break;
case Opt_ino32:
fsopt->flags |= CEPH_MOUNT_OPT_INO32;
break;
@@ -377,6 +387,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",norbytes");
if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir");
+ if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
+ seq_puts(m, ",dcache");
+ else
+ seq_puts(m, ",nodcache");
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
@@ -647,10 +661,10 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
root = ERR_PTR(-ENOMEM);
goto out;
}
- ceph_init_dentry(root);
} else {
root = d_obtain_alias(inode);
}
+ ceph_init_dentry(root);
dout("open_root_inode success, root dentry is %p\n", root);
} else {
root = ERR_PTR(err);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cb3652b37271..1421f3d875a2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -28,6 +28,7 @@
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
+#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a5e36e4488a7..857214ae8c08 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -818,6 +818,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
int issued;
int err;
+ int required_blob_size;
int dirty;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -833,14 +834,34 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
return -EOPNOTSUPP;
}
+ err = -ENOMEM;
spin_lock(&ci->i_ceph_lock);
__build_xattrs(inode);
+retry:
issued = __ceph_caps_issued(ci, NULL);
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
if (!(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
+ required_blob_size = __get_required_blob_size(ci, 0, 0);
+
+ if (!ci->i_xattrs.prealloc_blob ||
+ required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+ struct ceph_buffer *blob;
+
+ spin_unlock(&ci->i_ceph_lock);
+ dout(" preaallocating new blob size=%d\n", required_blob_size);
+ blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+ if (!blob)
+ goto out;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_xattrs.prealloc_blob)
+ ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ci->i_xattrs.prealloc_blob = blob;
+ goto retry;
+ }
+
err = __remove_xattr_by_name(ceph_inode(inode), name);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
ci->i_xattrs.dirty = true;
@@ -853,6 +874,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
do_sync:
spin_unlock(&ci->i_ceph_lock);
err = ceph_send_removexattr(dentry, name);
+out:
return err;
}
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 6475877b0763..911cf30d057d 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -88,24 +88,21 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
- link the two up if this is needed
- fill in the attributes
*/
-int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb)
+struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb)
{
struct coda_vattr attr;
+ struct inode *inode;
int error;
/* We get inode numbers from Venus -- see venus source */
error = venus_getattr(sb, fid, &attr);
- if ( error ) {
- *inode = NULL;
- return error;
- }
+ if (error)
+ return ERR_PTR(error);
- *inode = coda_iget(sb, fid, &attr);
- if ( IS_ERR(*inode) ) {
+ inode = coda_iget(sb, fid, &attr);
+ if (IS_ERR(inode))
printk("coda_cnode_make: coda_iget failed\n");
- return PTR_ERR(*inode);
- }
- return 0;
+ return inode;
}
@@ -156,19 +153,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
}
/* the CONTROL inode is made without asking attributes from Venus */
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb)
+struct inode *coda_cnode_makectl(struct super_block *sb)
{
- int error = -ENOMEM;
-
- *inode = new_inode(sb);
- if (*inode) {
- (*inode)->i_ino = CTL_INO;
- (*inode)->i_op = &coda_ioctl_inode_operations;
- (*inode)->i_fop = &coda_ioctl_operations;
- (*inode)->i_mode = 0444;
- error = 0;
+ struct inode *inode = new_inode(sb);
+ if (inode) {
+ inode->i_ino = CTL_INO;
+ inode->i_op = &coda_ioctl_inode_operations;
+ inode->i_fop = &coda_ioctl_operations;
+ inode->i_mode = 0444;
+ return inode;
}
-
- return error;
+ return ERR_PTR(-ENOMEM);
}
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index e35071b1de0e..b24fdfd8a3f0 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -49,9 +49,9 @@ struct coda_file_info {
#define C_DYING 0x4 /* from venus (which died) */
#define C_PURGE 0x8
-int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
+struct inode *coda_cnode_make(struct CodaFid *, struct super_block *);
struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
+struct inode *coda_cnode_makectl(struct super_block *sb);
struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 83d2fd8ec24b..177515829062 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -96,12 +96,11 @@ const struct file_operations coda_dir_operations = {
/* access routines: lookup, readlink, permission */
static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd)
{
- struct inode *inode = NULL;
- struct CodaFid resfid = { { 0, } };
- int type = 0;
- int error = 0;
+ struct super_block *sb = dir->i_sb;
const char *name = entry->d_name.name;
size_t length = entry->d_name.len;
+ struct inode *inode;
+ int type = 0;
if (length > CODA_MAXNAMLEN) {
printk(KERN_ERR "name too long: lookup, %s (%*s)\n",
@@ -111,23 +110,21 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
/* control object, create inode on the fly */
if (coda_isroot(dir) && coda_iscontrol(name, length)) {
- error = coda_cnode_makectl(&inode, dir->i_sb);
+ inode = coda_cnode_makectl(sb);
type = CODA_NOCACHE;
- goto exit;
+ } else {
+ struct CodaFid fid = { { 0, } };
+ int error = venus_lookup(sb, coda_i2f(dir), name, length,
+ &type, &fid);
+ inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error);
}
- error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
- &type, &resfid);
- if (!error)
- error = coda_cnode_make(&inode, &resfid, dir->i_sb);
-
- if (error && error != -ENOENT)
- return ERR_PTR(error);
-
-exit:
- if (inode && (type & CODA_NOCACHE))
+ if (!IS_ERR(inode) && (type & CODA_NOCACHE))
coda_flag_inode(inode, C_VATTR | C_PURGE);
+ if (inode == ERR_PTR(-ENOENT))
+ inode = NULL;
+
return d_splice_alias(inode, entry);
}
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 1c08a8cd673a..5e2e1b3f068d 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -204,10 +204,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid));
/* make root inode */
- error = coda_cnode_make(&root, &fid, sb);
- if ( error || !root ) {
- printk("Failure of coda_cnode_make for root: error %d\n", error);
- goto error;
+ root = coda_cnode_make(&fid, sb);
+ if (IS_ERR(root)) {
+ error = PTR_ERR(root);
+ printk("Failure of coda_cnode_make for root: error %d\n", error);
+ root = NULL;
+ goto error;
}
printk("coda_read_super: rootinode is %ld dev %s\n",
diff --git a/fs/dcache.c b/fs/dcache.c
index 3c6d3113a255..16a53cc2cc02 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -243,6 +243,7 @@ static void dentry_lru_add(struct dentry *dentry)
static void __dentry_lru_del(struct dentry *dentry)
{
list_del_init(&dentry->d_lru);
+ dentry->d_flags &= ~DCACHE_SHRINK_LIST;
dentry->d_sb->s_nr_dentry_unused--;
dentry_stat.nr_unused--;
}
@@ -806,6 +807,7 @@ relock:
spin_unlock(&dentry->d_lock);
} else {
list_move_tail(&dentry->d_lru, &tmp);
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
spin_unlock(&dentry->d_lock);
if (!--count)
break;
@@ -1097,14 +1099,19 @@ resume:
/*
* move only zero ref count dentries to the dispose list.
+ *
+ * Those which are presently on the shrink list, being processed
+ * by shrink_dentry_list(), shouldn't be moved. Otherwise the
+ * loop in shrink_dcache_parent() might not make any progress
+ * and loop forever.
*/
- if (!dentry->d_count) {
+ if (dentry->d_count) {
+ dentry_lru_del(dentry);
+ } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
dentry_lru_move_list(dentry, dispose);
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
found++;
- } else {
- dentry_lru_del(dentry);
}
-
/*
* We can return to the caller if we have found some (this
* ensures forward progress). We'll be coming back to find
@@ -1468,7 +1475,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
return alias;
}
-static struct dentry * d_find_any_alias(struct inode *inode)
+/**
+ * d_find_any_alias - find any alias for a given inode
+ * @inode: inode to find an alias for
+ *
+ * If any aliases exist for the given inode, take and return a
+ * reference for one of them. If no aliases exist, return %NULL.
+ */
+struct dentry *d_find_any_alias(struct inode *inode)
{
struct dentry *de;
@@ -1477,7 +1491,7 @@ static struct dentry * d_find_any_alias(struct inode *inode)
spin_unlock(&inode->i_lock);
return de;
}
-
+EXPORT_SYMBOL(d_find_any_alias);
/**
* d_obtain_alias - find or allocate a dentry for a given inode
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab67ff6e..4a588dbd11bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -36,6 +36,7 @@
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <linux/atomic.h>
+#include <linux/prefetch.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
{
int ret;
sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
+ sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
unsigned long fs_count; /* Number of filesystem-sized blocks */
- unsigned long dio_count;/* Number of dio_block-sized blocks */
- unsigned long blkmask;
int create;
/*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
if (ret == 0) {
BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
fs_startblk = sdio->block_in_file >> sdio->blkfactor;
- dio_count = sdio->final_block_in_request - sdio->block_in_file;
- fs_count = dio_count >> sdio->blkfactor;
- blkmask = (1 << sdio->blkfactor) - 1;
- if (dio_count & blkmask)
- fs_count++;
+ fs_endblk = (sdio->final_block_in_request - 1) >>
+ sdio->blkfactor;
+ fs_count = fs_endblk - fs_startblk + 1;
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
* individual fields and will generate much worse code. This is important
* for the whole file.
*/
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+static inline ssize_t
+do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
size_t size;
unsigned long addr;
unsigned blkbits = inode->i_blkbits;
- unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (rw & WRITE)
rw = WRITE_ODIRECT;
- if (bdev)
- bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+ /*
+ * Avoid references to bdev if not absolutely needed to give
+ * the early prefetch in the caller enough time.
+ */
if (offset & blocksize_mask) {
if (bdev)
- blkbits = bdev_blkbits;
+ blkbits = blksize_bits(bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask)
goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
end += size;
- if ((addr & blocksize_mask) || (size & blocksize_mask)) {
+ if (unlikely((addr & blocksize_mask) ||
+ (size & blocksize_mask))) {
if (bdev)
- blkbits = bdev_blkbits;
+ blkbits = blksize_bits(
+ bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out;
}
}
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
out:
return retval;
}
+
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+ dio_submit_t submit_io, int flags)
+{
+ /*
+ * The block device state is needed in the end to finally
+ * submit everything. Since it's likely to be cache cold
+ * prefetch it here as first thing to hide some of the
+ * latency.
+ *
+ * Attempt to prefetch the pieces we likely need later.
+ */
+ prefetch(&bdev->bd_disk->part_tbl);
+ prefetch(bdev->bd_queue);
+ prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+
+ return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+ nr_segs, get_block, end_io,
+ submit_io, flags);
+}
+
EXPORT_SYMBOL(__blockdev_direct_IO);
static __init int dio_init(void)
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 6cf72fcc0d0c..e7e327d43fa5 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/in6.h>
+#include <linux/dlmconstants.h>
#include <net/ipv6.h>
#include <net/sock.h>
@@ -36,6 +37,7 @@
static struct config_group *space_list;
static struct config_group *comm_list;
static struct dlm_comm *local_comm;
+static uint32_t dlm_comm_count;
struct dlm_clusters;
struct dlm_cluster;
@@ -103,6 +105,8 @@ struct dlm_cluster {
unsigned int cl_timewarn_cs;
unsigned int cl_waitwarn_us;
unsigned int cl_new_rsb_count;
+ unsigned int cl_recover_callbacks;
+ char cl_cluster_name[DLM_LOCKSPACE_LEN];
};
enum {
@@ -118,6 +122,8 @@ enum {
CLUSTER_ATTR_TIMEWARN_CS,
CLUSTER_ATTR_WAITWARN_US,
CLUSTER_ATTR_NEW_RSB_COUNT,
+ CLUSTER_ATTR_RECOVER_CALLBACKS,
+ CLUSTER_ATTR_CLUSTER_NAME,
};
struct cluster_attribute {
@@ -126,6 +132,27 @@ struct cluster_attribute {
ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
};
+static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
+{
+ return sprintf(buf, "%s\n", cl->cl_cluster_name);
+}
+
+static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
+ const char *buf, size_t len)
+{
+ strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
+ strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
+ return len;
+}
+
+static struct cluster_attribute cluster_attr_cluster_name = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "cluster_name",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = cluster_cluster_name_read,
+ .store = cluster_cluster_name_write,
+};
+
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
int *info_field, int check_zero,
const char *buf, size_t len)
@@ -171,6 +198,7 @@ CLUSTER_ATTR(protocol, 0);
CLUSTER_ATTR(timewarn_cs, 1);
CLUSTER_ATTR(waitwarn_us, 0);
CLUSTER_ATTR(new_rsb_count, 0);
+CLUSTER_ATTR(recover_callbacks, 0);
static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -185,6 +213,8 @@ static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
[CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
+ [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
+ [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
NULL,
};
@@ -293,6 +323,7 @@ struct dlm_comms {
struct dlm_comm {
struct config_item item;
+ int seq;
int nodeid;
int local;
int addr_count;
@@ -309,6 +340,7 @@ struct dlm_node {
int nodeid;
int weight;
int new;
+ int comm_seq; /* copy of cm->seq when nd->nodeid is set */
};
static struct configfs_group_operations clusters_ops = {
@@ -455,6 +487,9 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
+ cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
+ memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
+ DLM_LOCKSPACE_LEN);
space_list = &sps->ss_group;
comm_list = &cms->cs_group;
@@ -558,6 +593,11 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
return ERR_PTR(-ENOMEM);
config_item_init_type_name(&cm->item, name, &comm_type);
+
+ cm->seq = dlm_comm_count++;
+ if (!cm->seq)
+ cm->seq = dlm_comm_count++;
+
cm->nodeid = -1;
cm->local = 0;
cm->addr_count = 0;
@@ -801,7 +841,10 @@ static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
size_t len)
{
+ uint32_t seq = 0;
nd->nodeid = simple_strtol(buf, NULL, 0);
+ dlm_comm_seq(nd->nodeid, &seq);
+ nd->comm_seq = seq;
return len;
}
@@ -908,13 +951,13 @@ static void put_comm(struct dlm_comm *cm)
}
/* caller must free mem */
-int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
- int **new_out, int *new_count_out)
+int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
+ int *count_out)
{
struct dlm_space *sp;
struct dlm_node *nd;
- int i = 0, rv = 0, ids_count = 0, new_count = 0;
- int *ids, *new;
+ struct dlm_config_node *nodes, *node;
+ int rv, count;
sp = get_space(lsname);
if (!sp)
@@ -927,73 +970,42 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
goto out;
}
- ids_count = sp->members_count;
+ count = sp->members_count;
- ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
- if (!ids) {
+ nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS);
+ if (!nodes) {
rv = -ENOMEM;
goto out;
}
+ node = nodes;
list_for_each_entry(nd, &sp->members, list) {
- ids[i++] = nd->nodeid;
- if (nd->new)
- new_count++;
- }
-
- if (ids_count != i)
- printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i);
-
- if (!new_count)
- goto out_ids;
+ node->nodeid = nd->nodeid;
+ node->weight = nd->weight;
+ node->new = nd->new;
+ node->comm_seq = nd->comm_seq;
+ node++;
- new = kcalloc(new_count, sizeof(int), GFP_NOFS);
- if (!new) {
- kfree(ids);
- rv = -ENOMEM;
- goto out;
+ nd->new = 0;
}
- i = 0;
- list_for_each_entry(nd, &sp->members, list) {
- if (nd->new) {
- new[i++] = nd->nodeid;
- nd->new = 0;
- }
- }
- *new_count_out = new_count;
- *new_out = new;
-
- out_ids:
- *ids_count_out = ids_count;
- *ids_out = ids;
+ *count_out = count;
+ *nodes_out = nodes;
+ rv = 0;
out:
mutex_unlock(&sp->members_lock);
put_space(sp);
return rv;
}
-int dlm_node_weight(char *lsname, int nodeid)
+int dlm_comm_seq(int nodeid, uint32_t *seq)
{
- struct dlm_space *sp;
- struct dlm_node *nd;
- int w = -EEXIST;
-
- sp = get_space(lsname);
- if (!sp)
- goto out;
-
- mutex_lock(&sp->members_lock);
- list_for_each_entry(nd, &sp->members, list) {
- if (nd->nodeid != nodeid)
- continue;
- w = nd->weight;
- break;
- }
- mutex_unlock(&sp->members_lock);
- put_space(sp);
- out:
- return w;
+ struct dlm_comm *cm = get_comm(nodeid, NULL);
+ if (!cm)
+ return -EEXIST;
+ *seq = cm->seq;
+ put_comm(cm);
+ return 0;
}
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
@@ -1047,6 +1059,8 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
#define DEFAULT_WAITWARN_US 0
#define DEFAULT_NEW_RSB_COUNT 128
+#define DEFAULT_RECOVER_CALLBACKS 0
+#define DEFAULT_CLUSTER_NAME ""
struct dlm_config_info dlm_config = {
.ci_tcp_port = DEFAULT_TCP_PORT,
@@ -1060,6 +1074,8 @@ struct dlm_config_info dlm_config = {
.ci_protocol = DEFAULT_PROTOCOL,
.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
.ci_waitwarn_us = DEFAULT_WAITWARN_US,
- .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT
+ .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
+ .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
+ .ci_cluster_name = DEFAULT_CLUSTER_NAME
};
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 3099d0dd26c0..9f5e3663bb0c 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,13 @@
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
+struct dlm_config_node {
+ int nodeid;
+ int weight;
+ int new;
+ uint32_t comm_seq;
+};
+
#define DLM_MAX_ADDR_COUNT 3
struct dlm_config_info {
@@ -29,15 +36,17 @@ struct dlm_config_info {
int ci_timewarn_cs;
int ci_waitwarn_us;
int ci_new_rsb_count;
+ int ci_recover_callbacks;
+ char ci_cluster_name[DLM_LOCKSPACE_LEN];
};
extern struct dlm_config_info dlm_config;
int dlm_config_init(void);
void dlm_config_exit(void);
-int dlm_node_weight(char *lsname, int nodeid);
-int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
- int **new_out, int *new_count_out);
+int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
+ int *count_out);
+int dlm_comm_seq(int nodeid, uint32_t *seq);
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
int dlm_our_nodeid(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 59779237e2b4..3dca2b39e83f 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -393,6 +393,7 @@ static const struct seq_operations format3_seq_ops;
static void *table_seq_start(struct seq_file *seq, loff_t *pos)
{
+ struct rb_node *node;
struct dlm_ls *ls = seq->private;
struct rsbtbl_iter *ri;
struct dlm_rsb *r;
@@ -418,9 +419,10 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
ri->format = 3;
spin_lock(&ls->ls_rsbtbl[bucket].lock);
- if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
- list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
- res_hashchain) {
+ if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
+ for (node = rb_first(&ls->ls_rsbtbl[bucket].keep); node;
+ node = rb_next(node)) {
+ r = rb_entry(node, struct dlm_rsb, res_hashnode);
if (!entry--) {
dlm_hold_rsb(r);
ri->rsb = r;
@@ -449,9 +451,9 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
}
spin_lock(&ls->ls_rsbtbl[bucket].lock);
- if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
- r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
- struct dlm_rsb, res_hashchain);
+ if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
+ node = rb_first(&ls->ls_rsbtbl[bucket].keep);
+ r = rb_entry(node, struct dlm_rsb, res_hashnode);
dlm_hold_rsb(r);
ri->rsb = r;
ri->bucket = bucket;
@@ -467,7 +469,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
{
struct dlm_ls *ls = seq->private;
struct rsbtbl_iter *ri = iter_ptr;
- struct list_head *next;
+ struct rb_node *next;
struct dlm_rsb *r, *rp;
loff_t n = *pos;
unsigned bucket;
@@ -480,10 +482,10 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
spin_lock(&ls->ls_rsbtbl[bucket].lock);
rp = ri->rsb;
- next = rp->res_hashchain.next;
+ next = rb_next(&rp->res_hashnode);
- if (next != &ls->ls_rsbtbl[bucket].list) {
- r = list_entry(next, struct dlm_rsb, res_hashchain);
+ if (next) {
+ r = rb_entry(next, struct dlm_rsb, res_hashnode);
dlm_hold_rsb(r);
ri->rsb = r;
spin_unlock(&ls->ls_rsbtbl[bucket].lock);
@@ -511,9 +513,9 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
}
spin_lock(&ls->ls_rsbtbl[bucket].lock);
- if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
- r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
- struct dlm_rsb, res_hashchain);
+ if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
+ next = rb_first(&ls->ls_rsbtbl[bucket].keep);
+ r = rb_entry(next, struct dlm_rsb, res_hashnode);
dlm_hold_rsb(r);
ri->rsb = r;
ri->bucket = bucket;
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 7b84c1dbc82e..83641574b016 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -290,7 +290,6 @@ int dlm_recover_directory(struct dlm_ls *ls)
out_status:
error = 0;
- dlm_set_recover_status(ls, DLM_RS_DIR);
log_debug(ls, "dlm_recover_directory %d entries", count);
out_free:
kfree(last_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index fe2860c02449..3a564d197e99 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -103,8 +103,8 @@ struct dlm_dirtable {
};
struct dlm_rsbtable {
- struct list_head list;
- struct list_head toss;
+ struct rb_root keep;
+ struct rb_root toss;
spinlock_t lock;
};
@@ -117,6 +117,10 @@ struct dlm_member {
struct list_head list;
int nodeid;
int weight;
+ int slot;
+ int slot_prev;
+ int comm_seq;
+ uint32_t generation;
};
/*
@@ -125,10 +129,8 @@ struct dlm_member {
struct dlm_recover {
struct list_head list;
- int *nodeids; /* nodeids of all members */
- int node_count;
- int *new; /* nodeids of new members */
- int new_count;
+ struct dlm_config_node *nodes;
+ int nodes_count;
uint64_t seq;
};
@@ -285,7 +287,10 @@ struct dlm_rsb {
unsigned long res_toss_time;
uint32_t res_first_lkid;
struct list_head res_lookup; /* lkbs waiting on first */
- struct list_head res_hashchain; /* rsbtbl */
+ union {
+ struct list_head res_hashchain;
+ struct rb_node res_hashnode; /* rsbtbl */
+ };
struct list_head res_grantqueue;
struct list_head res_convertqueue;
struct list_head res_waitqueue;
@@ -334,7 +339,9 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
/* dlm_header is first element of all structs sent between nodes */
#define DLM_HEADER_MAJOR 0x00030000
-#define DLM_HEADER_MINOR 0x00000000
+#define DLM_HEADER_MINOR 0x00000001
+
+#define DLM_HEADER_SLOTS 0x00000001
#define DLM_MSG 1
#define DLM_RCOM 2
@@ -422,10 +429,34 @@ union dlm_packet {
struct dlm_rcom rcom;
};
+#define DLM_RSF_NEED_SLOTS 0x00000001
+
+/* RCOM_STATUS data */
+struct rcom_status {
+ __le32 rs_flags;
+ __le32 rs_unused1;
+ __le64 rs_unused2;
+};
+
+/* RCOM_STATUS_REPLY data */
struct rcom_config {
__le32 rf_lvblen;
__le32 rf_lsflags;
- __le64 rf_unused;
+
+ /* DLM_HEADER_SLOTS adds: */
+ __le32 rf_flags;
+ __le16 rf_our_slot;
+ __le16 rf_num_slots;
+ __le32 rf_generation;
+ __le32 rf_unused1;
+ __le64 rf_unused2;
+};
+
+struct rcom_slot {
+ __le32 ro_nodeid;
+ __le16 ro_slot;
+ __le16 ro_unused1;
+ __le64 ro_unused2;
};
struct rcom_lock {
@@ -452,6 +483,7 @@ struct dlm_ls {
struct list_head ls_list; /* list of lockspaces */
dlm_lockspace_t *ls_local_handle;
uint32_t ls_global_id; /* global unique lockspace ID */
+ uint32_t ls_generation;
uint32_t ls_exflags;
int ls_lvblen;
int ls_count; /* refcount of processes in
@@ -490,6 +522,11 @@ struct dlm_ls {
int ls_total_weight;
int *ls_node_array;
+ int ls_slot;
+ int ls_num_slots;
+ int ls_slots_size;
+ struct dlm_slot *ls_slots;
+
struct dlm_rsb ls_stub_rsb; /* for returning errors */
struct dlm_lkb ls_stub_lkb; /* for returning errors */
struct dlm_message ls_stub_ms; /* for faking a reply */
@@ -537,6 +574,9 @@ struct dlm_ls {
struct list_head ls_root_list; /* root resources */
struct rw_semaphore ls_root_sem; /* protect root_list */
+ const struct dlm_lockspace_ops *ls_ops;
+ void *ls_ops_arg;
+
int ls_namelen;
char ls_name[1];
};
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 83b5e32514e1..d47183043c59 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
L: receive_xxxx_reply() <- R: send_xxxx_reply()
*/
#include <linux/types.h>
+#include <linux/rbtree.h>
#include <linux/slab.h>
#include "dlm_internal.h"
#include <linux/dlm_device.h>
@@ -380,6 +381,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
list_del(&r->res_hashchain);
+ /* Convert the empty list_head to a NULL rb_node for tree usage: */
+ memset(&r->res_hashnode, 0, sizeof(struct rb_node));
ls->ls_new_rsb_count--;
spin_unlock(&ls->ls_new_rsb_spin);
@@ -388,7 +391,6 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
memcpy(r->res_name, name, len);
mutex_init(&r->res_mutex);
- INIT_LIST_HEAD(&r->res_hashchain);
INIT_LIST_HEAD(&r->res_lookup);
INIT_LIST_HEAD(&r->res_grantqueue);
INIT_LIST_HEAD(&r->res_convertqueue);
@@ -400,14 +402,31 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
return 0;
}
-static int search_rsb_list(struct list_head *head, char *name, int len,
+/*
+ * Compare r's resource name with name/nlen. The candidate is first copied
+ * into a zero-filled DLM_RESNAME_MAXLEN buffer so both sides are compared
+ * over the same fixed width. Returns the memcmp() result.
+ */
+static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
+{
+	char padded[DLM_RESNAME_MAXLEN] = { 0 };
+
+	memcpy(padded, name, nlen);
+	return memcmp(r->res_name, padded, sizeof(padded));
+}
+
+static int search_rsb_tree(struct rb_root *tree, char *name, int len,
unsigned int flags, struct dlm_rsb **r_ret)
{
+ struct rb_node *node = tree->rb_node;
struct dlm_rsb *r;
int error = 0;
-
- list_for_each_entry(r, head, res_hashchain) {
- if (len == r->res_length && !memcmp(name, r->res_name, len))
+ int rc;
+
+ while (node) {
+ r = rb_entry(node, struct dlm_rsb, res_hashnode);
+ rc = rsb_cmp(r, name, len);
+ if (rc < 0)
+ node = node->rb_left;
+ else if (rc > 0)
+ node = node->rb_right;
+ else
goto found;
}
*r_ret = NULL;
@@ -420,22 +439,54 @@ static int search_rsb_list(struct list_head *head, char *name, int len,
return error;
}
+/*
+ * Link rsb into tree, ordered by rsb_cmp(). Returns 0 on success, or
+ * -EEXIST (after logging and dumping both rsbs) when an entry with an equal
+ * name is already in the tree.
+ */
+static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
+{
+	struct rb_node **link = &tree->rb_node;
+	struct rb_node *above = NULL;
+	struct dlm_rsb *cur;
+	int rc;
+
+	while (*link) {
+		above = *link;
+		cur = rb_entry(above, struct dlm_rsb, res_hashnode);
+		rc = rsb_cmp(cur, rsb->res_name, rsb->res_length);
+
+		if (!rc) {
+			log_print("rsb_insert match");
+			dlm_dump_rsb(rsb);
+			dlm_dump_rsb(cur);
+			return -EEXIST;
+		}
+
+		/* descend in the same direction search_rsb_tree() uses */
+		link = (rc < 0) ? &above->rb_left : &above->rb_right;
+	}
+
+	rb_link_node(&rsb->res_hashnode, above, link);
+	rb_insert_color(&rsb->res_hashnode, tree);
+	return 0;
+}
+
static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
unsigned int flags, struct dlm_rsb **r_ret)
{
struct dlm_rsb *r;
int error;
- error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
+ error = search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r);
if (!error) {
kref_get(&r->res_ref);
goto out;
}
- error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
+ error = search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
if (error)
goto out;
- list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+ error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+ if (error)
+ return error;
if (dlm_no_directory(ls))
goto out;
@@ -527,8 +578,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
nodeid = 0;
r->res_nodeid = nodeid;
}
- list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
- error = 0;
+ error = rsb_insert(r, &ls->ls_rsbtbl[bucket].keep);
out_unlock:
spin_unlock(&ls->ls_rsbtbl[bucket].lock);
out:
@@ -556,7 +606,8 @@ static void toss_rsb(struct kref *kref)
DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
kref_init(&r->res_ref);
- list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
+ rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
r->res_toss_time = jiffies;
if (r->res_lvbptr) {
dlm_free_lvb(r->res_lvbptr);
@@ -1082,19 +1133,19 @@ static void dir_remove(struct dlm_rsb *r)
r->res_name, r->res_length);
}
-/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
- found since they are in order of newest to oldest? */
+/* FIXME: make this more efficient */
static int shrink_bucket(struct dlm_ls *ls, int b)
{
+ struct rb_node *n;
struct dlm_rsb *r;
int count = 0, found;
for (;;) {
found = 0;
spin_lock(&ls->ls_rsbtbl[b].lock);
- list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
- res_hashchain) {
+ for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
if (!time_after_eq(jiffies, r->res_toss_time +
dlm_config.ci_toss_secs * HZ))
continue;
@@ -1108,7 +1159,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
}
if (kref_put(&r->res_ref, kill_rsb)) {
- list_del(&r->res_hashchain);
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
spin_unlock(&ls->ls_rsbtbl[b].lock);
if (is_master(r))
@@ -4441,10 +4492,12 @@ int dlm_purge_locks(struct dlm_ls *ls)
static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
{
+ struct rb_node *n;
struct dlm_rsb *r, *r_ret = NULL;
spin_lock(&ls->ls_rsbtbl[bucket].lock);
- list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
+ for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
if (!rsb_flag(r, RSB_LOCKS_PURGED))
continue;
hold_rsb(r);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index a1d8f1af144b..a1ea25face82 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -386,12 +386,15 @@ static void threads_stop(void)
dlm_lowcomms_stop();
}
-static int new_lockspace(const char *name, int namelen, void **lockspace,
- uint32_t flags, int lvblen)
+static int new_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops, void *ops_arg,
+ int *ops_result, dlm_lockspace_t **lockspace)
{
struct dlm_ls *ls;
int i, size, error;
int do_unreg = 0;
+ int namelen = strlen(name);
if (namelen > DLM_LOCKSPACE_LEN)
return -EINVAL;
@@ -403,8 +406,24 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
return -EINVAL;
if (!dlm_user_daemon_available()) {
- module_put(THIS_MODULE);
- return -EUNATCH;
+ log_print("dlm user daemon not available");
+ error = -EUNATCH;
+ goto out;
+ }
+
+ if (ops && ops_result) {
+ if (!dlm_config.ci_recover_callbacks)
+ *ops_result = -EOPNOTSUPP;
+ else
+ *ops_result = 0;
+ }
+
+ if (dlm_config.ci_recover_callbacks && cluster &&
+ strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) {
+ log_print("dlm cluster name %s mismatch %s",
+ dlm_config.ci_cluster_name, cluster);
+ error = -EBADR;
+ goto out;
}
error = 0;
@@ -442,6 +461,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
ls->ls_flags = 0;
ls->ls_scan_time = jiffies;
+ if (ops && dlm_config.ci_recover_callbacks) {
+ ls->ls_ops = ops;
+ ls->ls_ops_arg = ops_arg;
+ }
+
if (flags & DLM_LSFL_TIMEWARN)
set_bit(LSFL_TIMEWARN, &ls->ls_flags);
@@ -457,8 +481,8 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
if (!ls->ls_rsbtbl)
goto out_lsfree;
for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
- INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
+ ls->ls_rsbtbl[i].keep.rb_node = NULL;
+ ls->ls_rsbtbl[i].toss.rb_node = NULL;
spin_lock_init(&ls->ls_rsbtbl[i].lock);
}
@@ -525,6 +549,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
if (!ls->ls_recover_buf)
goto out_dirfree;
+ ls->ls_slot = 0;
+ ls->ls_num_slots = 0;
+ ls->ls_slots_size = 0;
+ ls->ls_slots = NULL;
+
INIT_LIST_HEAD(&ls->ls_recover_list);
spin_lock_init(&ls->ls_recover_list_lock);
ls->ls_recover_list_count = 0;
@@ -614,8 +643,10 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
return error;
}
-int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
- uint32_t flags, int lvblen)
+int dlm_new_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops, void *ops_arg,
+ int *ops_result, dlm_lockspace_t **lockspace)
{
int error = 0;
@@ -625,7 +656,8 @@ int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
if (error)
goto out;
- error = new_lockspace(name, namelen, lockspace, flags, lvblen);
+ error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg,
+ ops_result, lockspace);
if (!error)
ls_count++;
if (error > 0)
@@ -685,7 +717,7 @@ static int lockspace_busy(struct dlm_ls *ls, int force)
static int release_lockspace(struct dlm_ls *ls, int force)
{
struct dlm_rsb *rsb;
- struct list_head *head;
+ struct rb_node *n;
int i, busy, rv;
busy = lockspace_busy(ls, force);
@@ -746,20 +778,15 @@ static int release_lockspace(struct dlm_ls *ls, int force)
*/
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
- head = &ls->ls_rsbtbl[i].list;
- while (!list_empty(head)) {
- rsb = list_entry(head->next, struct dlm_rsb,
- res_hashchain);
-
- list_del(&rsb->res_hashchain);
+ while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) {
+ rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+ rb_erase(n, &ls->ls_rsbtbl[i].keep);
dlm_free_rsb(rsb);
}
- head = &ls->ls_rsbtbl[i].toss;
- while (!list_empty(head)) {
- rsb = list_entry(head->next, struct dlm_rsb,
- res_hashchain);
- list_del(&rsb->res_hashchain);
+ while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) {
+ rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+ rb_erase(n, &ls->ls_rsbtbl[i].toss);
dlm_free_rsb(rsb);
}
}
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b12532e553f8..862640a36d5c 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,280 @@
#include "config.h"
#include "lowcomms.h"
+/*
+ * Returns 1 when the minor version (low 16 bits of h_version) is at least
+ * DLM_HEADER_SLOTS, otherwise 0.
+ */
+int dlm_slots_version(struct dlm_header *h)
+{
+	return ((h->h_version & 0x0000FFFF) >= DLM_HEADER_SLOTS) ? 1 : 0;
+}
+
+/*
+ * Copy the slot and generation a node reported in its status reply (rc) into
+ * memb. Leaves memb untouched when the reply's header version does not
+ * include slots.
+ */
+void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
+		   struct dlm_member *memb)
+{
+	struct rcom_config *rf;
+
+	if (dlm_slots_version(&rc->rc_header)) {
+		rf = (struct rcom_config *)rc->rc_buf;
+		memb->slot = le16_to_cpu(rf->rf_our_slot);
+		memb->generation = le32_to_cpu(rf->rf_generation);
+	}
+}
+
+/*
+ * Write every in-use entry of ls->ls_slots as a packed array of rcom_slot
+ * records located directly after the rcom_config in rc->rc_buf.
+ */
+void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct rcom_slot *ro;
+	struct dlm_slot *slot;
+	int i;
+
+	ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
+
+	/* ls_slots is sparse (nodeid 0 marks a hole); the output is dense */
+
+	for (i = 0; i < ls->ls_slots_size; i++) {
+		slot = &ls->ls_slots[i];
+		if (slot->nodeid) {
+			ro->ro_nodeid = cpu_to_le32(slot->nodeid);
+			ro->ro_slot = cpu_to_le16(slot->slot);
+			ro++;
+		}
+	}
+}
+
+#define SLOT_DEBUG_LINE 128
+
+/*
+ * Emit one debug line listing " slot:nodeid" pairs. The pairs come from the
+ * in-use entries of array when it is non-NULL, otherwise from the num_slots
+ * entries starting at ro0. Formatting stops as soon as the next pair would
+ * not fit in the SLOT_DEBUG_LINE buffer. Does nothing unless
+ * dlm_config.ci_log_debug is set.
+ */
+static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
+			    struct rcom_slot *ro0, struct dlm_slot *array,
+			    int array_size)
+{
+	char line[SLOT_DEBUG_LINE];
+	int room = SLOT_DEBUG_LINE - 1;
+	int used = 0;
+	int n, i;
+
+	if (!dlm_config.ci_log_debug)
+		return;
+
+	memset(line, 0, sizeof(line));
+
+	if (array) {
+		for (i = 0; i < array_size; i++) {
+			if (array[i].nodeid) {
+				n = snprintf(line + used, room - used,
+					     " %d:%d", array[i].slot,
+					     array[i].nodeid);
+				if (n >= room - used)
+					break;
+				used += n;
+			}
+		}
+	} else if (ro0) {
+		for (i = 0; i < num_slots; i++) {
+			n = snprintf(line + used, room - used, " %d:%d",
+				     ro0[i].ro_slot, ro0[i].ro_nodeid);
+			if (n >= room - used)
+				break;
+			used += n;
+		}
+	}
+
+	log_debug(ls, "generation %u slots %d%s", gen, num_slots, line);
+}
+
+/*
+ * Unpack the slot table held in ls->ls_recover_buf: adopt the reply's
+ * generation, convert the rcom_slot array that follows the rcom_config to
+ * CPU byte order in place, then copy each member's slot into its ls_nodes
+ * entry (and our own slot into ls->ls_slot if not yet set).
+ *
+ * Returns 0 on success. Returns -1 when the reply's header version has no
+ * slots, it carries zero slots, it claims more slots than the receive buffer
+ * can hold, our previously known slot would change, or a member ends up
+ * with slot 0.
+ */
+int dlm_slots_copy_in(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	struct dlm_rcom *rc = ls->ls_recover_buf;
+	struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
+	struct rcom_slot *ro0, *ro;
+	int our_nodeid = dlm_our_nodeid();
+	int i, num_slots, max_slots;
+	uint32_t gen;
+
+	if (!dlm_slots_version(&rc->rc_header))
+		return -1;
+
+	gen = le32_to_cpu(rf->rf_generation);
+	if (gen <= ls->ls_generation) {
+		log_error(ls, "dlm_slots_copy_in gen %u old %u",
+			  gen, ls->ls_generation);
+	}
+	ls->ls_generation = gen;
+
+	num_slots = le16_to_cpu(rf->rf_num_slots);
+	if (!num_slots)
+		return -1;
+
+	/* rf_num_slots comes off the wire; the loops below read and write
+	   that many entries, so refuse counts that cannot fit in the
+	   ci_buffer_size receive buffer (same bound as dlm_slots_assign) */
+
+	max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
+		     sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
+
+	if (num_slots > max_slots) {
+		log_error(ls, "dlm_slots_copy_in num_slots %d exceeds "
+			  "max_slots %d", num_slots, max_slots);
+		return -1;
+	}
+
+	ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
+
+	/* convert the array to CPU byte order in place */
+
+	for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
+		ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid);
+		ro->ro_slot = le16_to_cpu(ro->ro_slot);
+	}
+
+	log_debug_slots(ls, gen, num_slots, ro0, NULL, 0);
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
+			if (ro->ro_nodeid != memb->nodeid)
+				continue;
+			memb->slot = ro->ro_slot;
+			memb->slot_prev = memb->slot;
+			break;
+		}
+
+		if (memb->nodeid == our_nodeid) {
+			if (ls->ls_slot && ls->ls_slot != memb->slot) {
+				log_error(ls, "dlm_slots_copy_in our slot "
+					  "changed %d %d", ls->ls_slot,
+					  memb->slot);
+				return -1;
+			}
+
+			if (!ls->ls_slot)
+				ls->ls_slot = memb->slot;
+		}
+
+		if (!memb->slot) {
+			log_error(ls, "dlm_slots_copy_in nodeid %d no slot",
+				  memb->nodeid);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Build the slot table for the current members of ls.
+ *
+ * Members that already report a slot keep it; members with slot 0 are given
+ * the lowest unused slot. For any nodes that do not support slots, we will
+ * not have set memb->slot in wait_status_all(), so memb->slot will remain
+ * -1, and we will not assign slots or set ls_num_slots here.
+ *
+ * On success returns 0 and hands back a newly allocated array in *slots_out
+ * (not freed here, so the caller takes ownership), its length in
+ * *slots_size, the number of used entries in *num_slots, and the highest
+ * member generation plus one in *gen_out.
+ *
+ * Returns -ENOMEM if the array cannot be allocated, or -1 if a member lacks
+ * slots support, a member's slot changed, a slot number is out of range, no
+ * free slot is left, or the table would not fit in one rcom buffer.
+ */
+int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
+		     struct dlm_slot **slots_out, uint32_t *gen_out)
+{
+	struct dlm_member *memb;
+	struct dlm_slot *array;
+	int our_nodeid = dlm_our_nodeid();
+	int array_size, max_slots, i;
+	int need = 0;
+	int max = 0;
+	int num = 0;
+	uint32_t gen = 0;
+
+	/* our own memb struct will have slot -1 gen 0 */
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->nodeid == our_nodeid) {
+			memb->slot = ls->ls_slot;
+			memb->generation = ls->ls_generation;
+			break;
+		}
+	}
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->generation > gen)
+			gen = memb->generation;
+
+		/* node doesn't support slots */
+
+		if (memb->slot == -1)
+			return -1;
+
+		/* node needs a slot assigned */
+
+		if (!memb->slot)
+			need++;
+
+		/* track the highest slot number already in use */
+
+		if (!max || max < memb->slot)
+			max = memb->slot;
+
+		/* sanity check, once slot is assigned it shouldn't change */
+
+		if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) {
+			log_error(ls, "nodeid %d slot changed %d %d",
+				  memb->nodeid, memb->slot_prev, memb->slot);
+			return -1;
+		}
+		memb->slot_prev = memb->slot;
+	}
+
+	array_size = max + need;
+
+	/* kcalloc() zeroes like kzalloc() and also rejects a count * size
+	   multiplication that would overflow */
+
+	array = kcalloc(array_size, sizeof(*array), GFP_NOFS);
+	if (!array)
+		return -ENOMEM;
+
+	/* fill in slots (offsets) that are used */
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (!memb->slot)
+			continue;
+
+		if (memb->slot > array_size) {
+			log_error(ls, "invalid slot number %d", memb->slot);
+			kfree(array);
+			return -1;
+		}
+
+		array[memb->slot - 1].nodeid = memb->nodeid;
+		array[memb->slot - 1].slot = memb->slot;
+		num++;
+	}
+
+	/* assign new slots from unused offsets */
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->slot)
+			continue;
+
+		for (i = 0; i < array_size; i++) {
+			if (array[i].nodeid)
+				continue;
+
+			memb->slot = i + 1;
+			memb->slot_prev = memb->slot;
+			array[i].nodeid = memb->nodeid;
+			array[i].slot = memb->slot;
+			num++;
+
+			if (!ls->ls_slot && memb->nodeid == our_nodeid)
+				ls->ls_slot = memb->slot;
+			break;
+		}
+
+		if (!memb->slot) {
+			log_error(ls, "no free slot found");
+			kfree(array);
+			return -1;
+		}
+	}
+
+	gen++;
+
+	log_debug_slots(ls, gen, num, NULL, array, array_size);
+
+	/* the whole table has to fit after the rcom_config in one buffer */
+
+	max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
+		     sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
+
+	if (num > max_slots) {
+		log_error(ls, "num_slots %d exceeds max_slots %d",
+			  num, max_slots);
+		kfree(array);
+		return -1;
+	}
+
+	*gen_out = gen;
+	*slots_out = array;
+	*slots_size = array_size;
+	*num_slots = num;
+	return 0;
+}
+
static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
{
struct dlm_member *memb = NULL;
@@ -43,59 +317,51 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
}
}
-static int dlm_add_member(struct dlm_ls *ls, int nodeid)
+static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node)
{
struct dlm_member *memb;
- int w, error;
+ int error;
memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
if (!memb)
return -ENOMEM;
- w = dlm_node_weight(ls->ls_name, nodeid);
- if (w < 0) {
- kfree(memb);
- return w;
- }
-
- error = dlm_lowcomms_connect_node(nodeid);
+ error = dlm_lowcomms_connect_node(node->nodeid);
if (error < 0) {
kfree(memb);
return error;
}
- memb->nodeid = nodeid;
- memb->weight = w;
+ memb->nodeid = node->nodeid;
+ memb->weight = node->weight;
+ memb->comm_seq = node->comm_seq;
add_ordered_member(ls, memb);
ls->ls_num_nodes++;
return 0;
}
-static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
-{
- list_move(&memb->list, &ls->ls_nodes_gone);
- ls->ls_num_nodes--;
-}
-
-int dlm_is_member(struct dlm_ls *ls, int nodeid)
+static struct dlm_member *find_memb(struct list_head *head, int nodeid)
{
struct dlm_member *memb;
- list_for_each_entry(memb, &ls->ls_nodes, list) {
+ list_for_each_entry(memb, head, list) {
if (memb->nodeid == nodeid)
- return 1;
+ return memb;
}
+ return NULL;
+}
+
+int dlm_is_member(struct dlm_ls *ls, int nodeid)
+{
+ if (find_memb(&ls->ls_nodes, nodeid))
+ return 1;
return 0;
}
int dlm_is_removed(struct dlm_ls *ls, int nodeid)
{
- struct dlm_member *memb;
-
- list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
- if (memb->nodeid == nodeid)
- return 1;
- }
+ if (find_memb(&ls->ls_nodes_gone, nodeid))
+ return 1;
return 0;
}
@@ -176,7 +442,7 @@ static int ping_members(struct dlm_ls *ls)
error = dlm_recovery_stopped(ls);
if (error)
break;
- error = dlm_rcom_status(ls, memb->nodeid);
+ error = dlm_rcom_status(ls, memb->nodeid, 0);
if (error)
break;
}
@@ -186,10 +452,88 @@ static int ping_members(struct dlm_ls *ls)
return error;
}
+/* Invoke the registered recover_prep callback, if there is one. */
+static void dlm_lsop_recover_prep(struct dlm_ls *ls)
+{
+	if (ls->ls_ops && ls->ls_ops->recover_prep)
+		ls->ls_ops->recover_prep(ls->ls_ops_arg);
+}
+
+/*
+ * Report memb to the registered recover_slot callback, but only when the
+ * node is considered to have failed rather than left cleanly.
+ */
+static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
+{
+	struct dlm_slot slot;
+	uint32_t seq;
+	int error;
+
+	if (!ls->ls_ops || !ls->ls_ops->recover_slot)
+		return;
+
+	/* if there is no comms connection with this node
+	   or the present comms connection is newer
+	   than the one when this member was added, then
+	   we consider the node to have failed (versus
+	   being removed due to dlm_release_lockspace) */
+
+	error = dlm_comm_seq(memb->nodeid, &seq);
+
+	if (error || seq != memb->comm_seq) {
+		slot.nodeid = memb->nodeid;
+		slot.slot = memb->slot;
+
+		ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot);
+	}
+}
+
+/*
+ * Pass the registered recover_done callback a temporary array holding the
+ * (nodeid, slot) pair of every current member, along with our own slot and
+ * the lockspace generation. Does nothing if no callback is registered or
+ * the array cannot be allocated; the callback is skipped if ls_nodes holds
+ * more entries than ls_num_nodes.
+ */
+void dlm_lsop_recover_done(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	struct dlm_slot *slots;
+	int i, num;
+
+	if (!ls->ls_ops || !ls->ls_ops->recover_done)
+		return;
+
+	num = ls->ls_num_nodes;
+
+	/* kcalloc() zeroes like kzalloc() and also rejects a count * size
+	   multiplication that would overflow */
+
+	slots = kcalloc(num, sizeof(*slots), GFP_KERNEL);
+	if (!slots)
+		return;
+
+	i = 0;
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (i == num) {
+			log_error(ls, "dlm_lsop_recover_done bad num %d", num);
+			goto out;
+		}
+		slots[i].nodeid = memb->nodeid;
+		slots[i].slot = memb->slot;
+		i++;
+	}
+
+	ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num,
+				 ls->ls_slot, ls->ls_generation);
+ out:
+	kfree(slots);
+}
+
+/*
+ * Return the entry of rv->nodes whose nodeid matches, or NULL if none of
+ * the rv->nodes_count entries does.
+ */
+static struct dlm_config_node *find_config_node(struct dlm_recover *rv,
+						int nodeid)
+{
+	struct dlm_config_node *node;
+	int i = 0;
+
+	while (i < rv->nodes_count) {
+		node = &rv->nodes[i++];
+		if (node->nodeid == nodeid)
+			return node;
+	}
+	return NULL;
+}
+
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
{
struct dlm_member *memb, *safe;
- int i, error, found, pos = 0, neg = 0, low = -1;
+ struct dlm_config_node *node;
+ int i, error, neg = 0, low = -1;
/* previously removed members that we've not finished removing need to
count as a negative change so the "neg" recovery steps will happen */
@@ -202,46 +546,32 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
/* move departed members from ls_nodes to ls_nodes_gone */
list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
- found = 0;
- for (i = 0; i < rv->node_count; i++) {
- if (memb->nodeid == rv->nodeids[i]) {
- found = 1;
- break;
- }
- }
+ node = find_config_node(rv, memb->nodeid);
+ if (node && !node->new)
+ continue;
- if (!found) {
- neg++;
- dlm_remove_member(ls, memb);
+ if (!node) {
log_debug(ls, "remove member %d", memb->nodeid);
+ } else {
+ /* removed and re-added */
+ log_debug(ls, "remove member %d comm_seq %u %u",
+ memb->nodeid, memb->comm_seq, node->comm_seq);
}
- }
-
- /* Add an entry to ls_nodes_gone for members that were removed and
- then added again, so that previous state for these nodes will be
- cleared during recovery. */
-
- for (i = 0; i < rv->new_count; i++) {
- if (!dlm_is_member(ls, rv->new[i]))
- continue;
- log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
- memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
- if (!memb)
- return -ENOMEM;
- memb->nodeid = rv->new[i];
- list_add_tail(&memb->list, &ls->ls_nodes_gone);
neg++;
+ list_move(&memb->list, &ls->ls_nodes_gone);
+ ls->ls_num_nodes--;
+ dlm_lsop_recover_slot(ls, memb);
}
/* add new members to ls_nodes */
- for (i = 0; i < rv->node_count; i++) {
- if (dlm_is_member(ls, rv->nodeids[i]))
+ for (i = 0; i < rv->nodes_count; i++) {
+ node = &rv->nodes[i];
+ if (dlm_is_member(ls, node->nodeid))
continue;
- dlm_add_member(ls, rv->nodeids[i]);
- pos++;
- log_debug(ls, "add member %d", rv->nodeids[i]);
+ dlm_add_member(ls, node);
+ log_debug(ls, "add member %d", node->nodeid);
}
list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -251,7 +581,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
ls->ls_low_nodeid = low;
make_member_array(ls);
- dlm_set_recover_status(ls, DLM_RS_NODES);
*neg_out = neg;
error = ping_members(ls);
@@ -261,12 +590,8 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
ls->ls_members_result = error;
complete(&ls->ls_members_done);
}
- if (error)
- goto out;
- error = dlm_recover_members_wait(ls);
- out:
- log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
+ log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
}
@@ -327,26 +652,35 @@ int dlm_ls_stop(struct dlm_ls *ls)
*/
dlm_recoverd_suspend(ls);
+
+ spin_lock(&ls->ls_recover_lock);
+ kfree(ls->ls_slots);
+ ls->ls_slots = NULL;
+ ls->ls_num_slots = 0;
+ ls->ls_slots_size = 0;
ls->ls_recover_status = 0;
+ spin_unlock(&ls->ls_recover_lock);
+
dlm_recoverd_resume(ls);
if (!ls->ls_recover_begin)
ls->ls_recover_begin = jiffies;
+
+ dlm_lsop_recover_prep(ls);
return 0;
}
int dlm_ls_start(struct dlm_ls *ls)
{
struct dlm_recover *rv = NULL, *rv_old;
- int *ids = NULL, *new = NULL;
- int error, ids_count = 0, new_count = 0;
+ /* must start out NULL: the fail: path below kfree()s nodes even when
+ dlm_config_nodes() returned an error without handing back an array */
+ struct dlm_config_node *nodes = NULL;
+ int error, count;
rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
if (!rv)
return -ENOMEM;
- error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count,
- &new, &new_count);
+ error = dlm_config_nodes(ls->ls_name, &nodes, &count);
if (error < 0)
goto fail;
@@ -361,10 +695,8 @@ int dlm_ls_start(struct dlm_ls *ls)
goto fail;
}
- rv->nodeids = ids;
- rv->node_count = ids_count;
- rv->new = new;
- rv->new_count = new_count;
+ rv->nodes = nodes;
+ rv->nodes_count = count;
rv->seq = ++ls->ls_recover_seq;
rv_old = ls->ls_recover_args;
ls->ls_recover_args = rv;
@@ -372,9 +704,8 @@ int dlm_ls_start(struct dlm_ls *ls)
if (rv_old) {
log_error(ls, "unused recovery %llx %d",
- (unsigned long long)rv_old->seq, rv_old->node_count);
- kfree(rv_old->nodeids);
- kfree(rv_old->new);
+ (unsigned long long)rv_old->seq, rv_old->nodes_count);
+ kfree(rv_old->nodes);
kfree(rv_old);
}
@@ -383,8 +714,7 @@ int dlm_ls_start(struct dlm_ls *ls)
fail:
kfree(rv);
- kfree(ids);
- kfree(new);
+ kfree(nodes);
return error;
}
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 7a26fca1e0b5..3deb70661c69 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -20,6 +20,14 @@ void dlm_clear_members_gone(struct dlm_ls *ls);
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
int dlm_is_removed(struct dlm_ls *ls, int nodeid);
int dlm_is_member(struct dlm_ls *ls, int nodeid);
+int dlm_slots_version(struct dlm_header *h);
+void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
+ struct dlm_member *memb);
+void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_slots_copy_in(struct dlm_ls *ls);
+int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
+ struct dlm_slot **slots_out, uint32_t *gen_out);
+void dlm_lsop_recover_done(struct dlm_ls *ls);
#endif /* __MEMBER_DOT_H__ */
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f10a50f24e8f..ac5c616c9696 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -23,6 +23,7 @@
#include "memory.h"
#include "lock.h"
#include "util.h"
+#include "member.h"
static int rcom_response(struct dlm_ls *ls)
@@ -72,20 +73,30 @@ static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
dlm_lowcomms_commit_buffer(mh);
}
+/* Fill in the RCOM_STATUS payload; flags are stored little-endian. */
+static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
+			    uint32_t flags)
+{
+	__le32 wire_flags = cpu_to_le32(flags);
+
+	rs->rs_flags = wire_flags;
+}
+
/* When replying to a status request, a node also sends back its
configuration values. The requesting node then checks that the remote
node is configured the same way as itself. */
-static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
+static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf,
+ uint32_t num_slots)
{
rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
+
+ rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
+ rf->rf_num_slots = cpu_to_le16(num_slots);
+ rf->rf_generation = cpu_to_le32(ls->ls_generation);
}
-static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
+static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
{
struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
- size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
log_error(ls, "version mismatch: %x nodeid %d: %x",
@@ -94,12 +105,6 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
return -EPROTO;
}
- if (rc->rc_header.h_length < conf_size) {
- log_error(ls, "config too short: %d nodeid %d",
- rc->rc_header.h_length, nodeid);
- return -EPROTO;
- }
-
if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen ||
le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) {
log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
@@ -127,7 +132,18 @@ static void disallow_sync_reply(struct dlm_ls *ls)
spin_unlock(&ls->ls_rcom_spin);
}
-int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
+/*
+ * low nodeid gathers one slot value at a time from each node.
+ * it sets need_slots=0, and saves rf_our_slot returned from each
+ * rcom_config.
+ *
+ * other nodes gather all slot values at once from the low nodeid.
+ * they set need_slots=1, and ignore the rf_our_slot returned from each
+ * rcom_config. they use the rf_num_slots returned from the low
+ * node's rcom_config.
+ */
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
@@ -141,10 +157,13 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
goto out;
}
- error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
+ error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
+ sizeof(struct rcom_status), &rc, &mh);
if (error)
goto out;
+ set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
+
allow_sync_reply(ls, &rc->rc_id);
memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
@@ -161,8 +180,11 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
/* we pretend the remote lockspace exists with 0 status */
log_debug(ls, "remote node %d not ready", nodeid);
rc->rc_result = 0;
- } else
- error = check_config(ls, rc, nodeid);
+ error = 0;
+ } else {
+ error = check_rcom_config(ls, rc, nodeid);
+ }
+
/* the caller looks at rc_result for the remote recovery status */
out:
return error;
@@ -172,17 +194,60 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
- int error, nodeid = rc_in->rc_header.h_nodeid;
+ struct rcom_status *rs;
+ uint32_t status;
+ int nodeid = rc_in->rc_header.h_nodeid;
+ int len = sizeof(struct rcom_config);
+ int num_slots = 0;
+ int error;
+
+ /* a sender whose header version has no slots gets a plain reply */
+ if (!dlm_slots_version(&rc_in->rc_header)) {
+ status = dlm_recover_status(ls);
+ goto do_create;
+ }
+
+ rs = (struct rcom_status *)rc_in->rc_buf;
+ /* rs_flags is __le32 (written with cpu_to_le32 in set_rcom_status),
+ so convert before testing or the flag is missed on big-endian */
+ if (!(le32_to_cpu(rs->rs_flags) & DLM_RSF_NEED_SLOTS)) {
+ status = dlm_recover_status(ls);
+ goto do_create;
+ }
+
+ /* sample status and slot count together under ls_recover_lock */
+ spin_lock(&ls->ls_recover_lock);
+ status = ls->ls_recover_status;
+ num_slots = ls->ls_num_slots;
+ spin_unlock(&ls->ls_recover_lock);
+ len += num_slots * sizeof(struct rcom_slot);
+
+ do_create:
error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
- sizeof(struct rcom_config), &rc, &mh);
+ len, &rc, &mh);
if (error)
return;
+
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
- rc->rc_result = dlm_recover_status(ls);
- make_config(ls, (struct rcom_config *) rc->rc_buf);
+ rc->rc_result = status;
+
+ set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
+
+ if (!num_slots)
+ goto do_send;
+
+ /* the reply was sized for num_slots entries; if the count changed
+ while the lock was dropped, send a zero-status, zero-slot reply */
+ spin_lock(&ls->ls_recover_lock);
+ if (ls->ls_num_slots != num_slots) {
+ spin_unlock(&ls->ls_recover_lock);
+ log_debug(ls, "receive_rcom_status num_slots %d to %d",
+ num_slots, ls->ls_num_slots);
+ rc->rc_result = 0;
+ set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0);
+ goto do_send;
+ }
+
+ dlm_slots_copy_out(ls, rc);
+ spin_unlock(&ls->ls_recover_lock);
+ do_send:
send_rcom(ls, mh, rc);
}
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index b09abd29ba38..206723ab744d 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -14,7 +14,7 @@
#ifndef __RCOM_DOT_H__
#define __RCOM_DOT_H__
-int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags);
int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 14638235f7b2..34d5adf1fce7 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -85,14 +85,20 @@ uint32_t dlm_recover_status(struct dlm_ls *ls)
return status;
}
+/*
+ * OR status into ls->ls_recover_status. Takes no lock itself;
+ * dlm_set_recover_status() calls it with ls_recover_lock held.
+ */
+static void _set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+	ls->ls_recover_status = ls->ls_recover_status | status;
+}
+
+/* Set the given recovery status bits while holding ls->ls_recover_lock. */
void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
{
spin_lock(&ls->ls_recover_lock);
- ls->ls_recover_status |= status;
+ _set_recover_status(ls, status);
spin_unlock(&ls->ls_recover_lock);
}
-static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
+static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
+ int save_slots)
{
struct dlm_rcom *rc = ls->ls_recover_buf;
struct dlm_member *memb;
@@ -106,10 +112,13 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
goto out;
}
- error = dlm_rcom_status(ls, memb->nodeid);
+ error = dlm_rcom_status(ls, memb->nodeid, 0);
if (error)
goto out;
+ if (save_slots)
+ dlm_slot_save(ls, rc, memb);
+
if (rc->rc_result & wait_status)
break;
if (delay < 1000)
@@ -121,7 +130,8 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
return error;
}
-static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
+static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
+ uint32_t status_flags)
{
struct dlm_rcom *rc = ls->ls_recover_buf;
int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
@@ -132,7 +142,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
goto out;
}
- error = dlm_rcom_status(ls, nodeid);
+ error = dlm_rcom_status(ls, nodeid, status_flags);
if (error)
break;
@@ -152,18 +162,56 @@ static int wait_status(struct dlm_ls *ls, uint32_t status)
int error;
if (ls->ls_low_nodeid == dlm_our_nodeid()) {
- error = wait_status_all(ls, status);
+ error = wait_status_all(ls, status, 0);
if (!error)
dlm_set_recover_status(ls, status_all);
} else
- error = wait_status_low(ls, status_all);
+ error = wait_status_low(ls, status_all, 0);
return error;
}
+/*
+ * Wait for the membership step of recovery and settle slot information.
+ *
+ * On the node whose id equals ls->ls_low_nodeid: poll the members for
+ * DLM_RS_NODES (saving the slot data carried in their replies), assign
+ * slots, then set DLM_RS_NODES_ALL.  On every other node: poll the low
+ * node for DLM_RS_NODES_ALL, asking it for slots, and copy them in.
+ *
+ * Returns 0 on success or the error from wait_status_all() /
+ * wait_status_low().
+ */
int dlm_recover_members_wait(struct dlm_ls *ls)
{
- return wait_status(ls, DLM_RS_NODES);
+ struct dlm_member *memb;
+ struct dlm_slot *slots;
+ int num_slots, slots_size;
+ int error, rv;
+ uint32_t gen;
+
+ /* forget slot/generation values from any earlier recovery */
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ memb->slot = -1;
+ memb->generation = 0;
+ }
+
+ if (ls->ls_low_nodeid == dlm_our_nodeid()) {
+ /* third argument (save_slots) = 1: keep slot data from each reply */
+ error = wait_status_all(ls, DLM_RS_NODES, 1);
+ if (error)
+ goto out;
+
+ /* slots array is sparse, slots_size may be > num_slots */
+
+ rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen);
+ if (!rv) {
+ /*
+ * Set the status bit and store the slot data under a single
+ * hold of ls_recover_lock; the status reply path reads
+ * ls_recover_status and ls_num_slots under the same lock.
+ */
+ spin_lock(&ls->ls_recover_lock);
+ _set_recover_status(ls, DLM_RS_NODES_ALL);
+ ls->ls_num_slots = num_slots;
+ ls->ls_slots_size = slots_size;
+ ls->ls_slots = slots;
+ ls->ls_generation = gen;
+ spin_unlock(&ls->ls_recover_lock);
+ } else {
+ /* non-zero rv: still advance the status, but store no slots */
+ dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
+ }
+ } else {
+ /* DLM_RSF_NEED_SLOTS asks the low node to include slots in its reply */
+ error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS);
+ if (error)
+ goto out;
+
+ dlm_slots_copy_in(ls);
+ }
+ out:
+ return error;
}
int dlm_recover_directory_wait(struct dlm_ls *ls)
@@ -542,8 +590,6 @@ int dlm_recover_locks(struct dlm_ls *ls)
out:
if (error)
recover_list_clear(ls);
- else
- dlm_set_recover_status(ls, DLM_RS_LOCKS);
return error;
}
@@ -715,6 +761,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
int dlm_create_root_list(struct dlm_ls *ls)
{
+ struct rb_node *n;
struct dlm_rsb *r;
int i, error = 0;
@@ -727,7 +774,8 @@ int dlm_create_root_list(struct dlm_ls *ls)
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
spin_lock(&ls->ls_rsbtbl[i].lock);
- list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
+ for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
list_add(&r->res_root_list, &ls->ls_root_list);
dlm_hold_rsb(r);
}
@@ -741,7 +789,8 @@ int dlm_create_root_list(struct dlm_ls *ls)
continue;
}
- list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
+ for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
list_add(&r->res_root_list, &ls->ls_root_list);
dlm_hold_rsb(r);
}
@@ -771,16 +820,18 @@ void dlm_release_root_list(struct dlm_ls *ls)
void dlm_clear_toss_list(struct dlm_ls *ls)
{
- struct dlm_rsb *r, *safe;
+ struct rb_node *n, *next;
+ struct dlm_rsb *rsb;
int i;
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
spin_lock(&ls->ls_rsbtbl[i].lock);
- list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
- res_hashchain) {
- if (dlm_no_directory(ls) || !is_master(r)) {
- list_del(&r->res_hashchain);
- dlm_free_rsb(r);
+ for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) {
+ next = rb_next(n);;
+ rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+ if (dlm_no_directory(ls) || !is_master(rsb)) {
+ rb_erase(n, &ls->ls_rsbtbl[i].toss);
+ dlm_free_rsb(rsb);
}
}
spin_unlock(&ls->ls_rsbtbl[i].lock);
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 774da3cf92c6..3780caf7ae0c 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
unsigned long start;
int error, neg = 0;
- log_debug(ls, "recover %llx", (unsigned long long)rv->seq);
+ log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq);
mutex_lock(&ls->ls_recoverd_active);
@@ -76,14 +76,22 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
/*
* Add or remove nodes from the lockspace's ls_nodes list.
- * Also waits for all nodes to complete dlm_recover_members.
*/
error = dlm_recover_members(ls, rv, &neg);
if (error) {
- log_debug(ls, "recover_members failed %d", error);
+ log_debug(ls, "dlm_recover_members error %d", error);
goto fail;
}
+
+ dlm_set_recover_status(ls, DLM_RS_NODES);
+
+ error = dlm_recover_members_wait(ls);
+ if (error) {
+ log_debug(ls, "dlm_recover_members_wait error %d", error);
+ goto fail;
+ }
+
start = jiffies;
/*
@@ -93,17 +101,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_directory(ls);
if (error) {
- log_debug(ls, "recover_directory failed %d", error);
+ log_debug(ls, "dlm_recover_directory error %d", error);
goto fail;
}
- /*
- * Wait for all nodes to complete directory rebuild.
- */
+ dlm_set_recover_status(ls, DLM_RS_DIR);
error = dlm_recover_directory_wait(ls);
if (error) {
- log_debug(ls, "recover_directory_wait failed %d", error);
+ log_debug(ls, "dlm_recover_directory_wait error %d", error);
goto fail;
}
@@ -133,7 +139,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_masters(ls);
if (error) {
- log_debug(ls, "recover_masters failed %d", error);
+ log_debug(ls, "dlm_recover_masters error %d", error);
goto fail;
}
@@ -143,13 +149,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_locks(ls);
if (error) {
- log_debug(ls, "recover_locks failed %d", error);
+ log_debug(ls, "dlm_recover_locks error %d", error);
goto fail;
}
+ dlm_set_recover_status(ls, DLM_RS_LOCKS);
+
error = dlm_recover_locks_wait(ls);
if (error) {
- log_debug(ls, "recover_locks_wait failed %d", error);
+ log_debug(ls, "dlm_recover_locks_wait error %d", error);
goto fail;
}
@@ -170,7 +178,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_locks_wait(ls);
if (error) {
- log_debug(ls, "recover_locks_wait failed %d", error);
+ log_debug(ls, "dlm_recover_locks_wait error %d", error);
goto fail;
}
}
@@ -186,9 +194,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_purge_requestqueue(ls);
dlm_set_recover_status(ls, DLM_RS_DONE);
+
error = dlm_recover_done_wait(ls);
if (error) {
- log_debug(ls, "recover_done_wait failed %d", error);
+ log_debug(ls, "dlm_recover_done_wait error %d", error);
goto fail;
}
@@ -200,34 +209,35 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = enable_locking(ls, rv->seq);
if (error) {
- log_debug(ls, "enable_locking failed %d", error);
+ log_debug(ls, "enable_locking error %d", error);
goto fail;
}
error = dlm_process_requestqueue(ls);
if (error) {
- log_debug(ls, "process_requestqueue failed %d", error);
+ log_debug(ls, "dlm_process_requestqueue error %d", error);
goto fail;
}
error = dlm_recover_waiters_post(ls);
if (error) {
- log_debug(ls, "recover_waiters_post failed %d", error);
+ log_debug(ls, "dlm_recover_waiters_post error %d", error);
goto fail;
}
dlm_grant_after_purge(ls);
- log_debug(ls, "recover %llx done: %u ms",
- (unsigned long long)rv->seq,
+ log_debug(ls, "dlm_recover %llx generation %u done: %u ms",
+ (unsigned long long)rv->seq, ls->ls_generation,
jiffies_to_msecs(jiffies - start));
mutex_unlock(&ls->ls_recoverd_active);
+ dlm_lsop_recover_done(ls);
return 0;
fail:
dlm_release_root_list(ls);
- log_debug(ls, "recover %llx error %d",
+ log_debug(ls, "dlm_recover %llx error %d",
(unsigned long long)rv->seq, error);
mutex_unlock(&ls->ls_recoverd_active);
return error;
@@ -250,8 +260,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
if (rv) {
ls_recover(ls, rv);
- kfree(rv->nodeids);
- kfree(rv->new);
+ kfree(rv->nodes);
kfree(rv);
}
}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d8ea60756403..eb4ed9ba3098 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -392,8 +392,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- error = dlm_new_lockspace(params->name, strlen(params->name),
- &lockspace, params->flags, DLM_USER_LVB_LEN);
+ error = dlm_new_lockspace(params->name, NULL, params->flags,
+ DLM_USER_LVB_LEN, NULL, NULL, NULL,
+ &lockspace);
if (error)
return error;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 828e750af23a..aabdfc38cf24 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -197,6 +197,12 @@ struct eventpoll {
/* The user that created the eventpoll descriptor */
struct user_struct *user;
+
+ struct file *file;
+
+ /* used to optimize loop detection check */
+ int visited;
+ struct list_head visited_list_link;
};
/* Wait structure used by the poll hooks */
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
+/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
+static LIST_HEAD(visited_list);
+
+/*
+ * List of files with newly added links, where we may need to limit the number
+ * of emanating paths. Protected by the epmutex.
+ */
+static LIST_HEAD(tfile_check_list);
+
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
};
#endif /* CONFIG_SYSCTL */
+static const struct file_operations eventpoll_fops;
+
+/*
+ * Fast test to see if the file is an eventpoll file: true when its f_op
+ * points at eventpoll_fops.
+ */
+static inline int is_file_epoll(struct file *f)
+{
+ return f->f_op == &eventpoll_fops;
+}
/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = {
.llseek = noop_llseek,
};
-/* Fast test to see if the file is an eventpoll file */
-static inline int is_file_epoll(struct file *f)
-{
- return f->f_op == &eventpoll_fops;
-}
-
/*
* This is called from eventpoll_release() to unlink files from the eventpoll
* interface. We need to have this facility to cleanup correctly files that are
@@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
rb_insert_color(&epi->rbn, &ep->rbr);
}
+
+
+#define PATH_ARR_SIZE 5
+/*
+ * These are the numbers of paths of length 1 to 5 that we allow to emanate
+ * from a single file of interest. For example, we allow 1000 paths of length
+ * 1, to emanate from each file of interest. This essentially represents the
+ * potential wakeup paths, which need to be limited in order to avoid massive
+ * uncontrolled wakeup storms. The common use case should be a single ep which
+ * is connected to n file sources. In this case each file source has 1 path
+ * of length 1. Thus, the numbers below should be more than sufficient. These
+ * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
+ * and delete can't add additional paths. Protected by the epmutex.
+ */
+static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
+/*
+ * Running tallies compared against path_limits[]; indexed by the nesting
+ * depth passed to path_count_inc() and zeroed by path_count_init().
+ */
+static int path_count[PATH_ARR_SIZE];
+
+/*
+ * Account for one more path at nesting depth @nests.
+ *
+ * The counter is bumped unconditionally.  Returns 0 while the tally stays
+ * within path_limits[nests], or -1 once the limit has been exceeded.
+ */
+static int path_count_inc(int nests)
+{
+	int seen;
+
+	seen = ++path_count[nests];
+	return (seen > path_limits[nests]) ? -1 : 0;
+}
+
+/* Reset every entry of path_count[] to zero. */
+static void path_count_init(void)
+{
+	int depth = PATH_ARR_SIZE;
+
+	while (depth-- > 0)
+		path_count[depth] = 0;
+}
+
+/*
+ * ep_call_nested() callback used by reverse_path_check().
+ *
+ * @priv is the struct file being examined.  Each epitem on its f_ep_links
+ * list leads, through epi->ep->file, to the file of an eventpoll that is
+ * linked to it.  If that eventpoll file has an empty f_ep_links list the
+ * path ends there and is charged to path_count[] at depth @call_nests;
+ * otherwise the walk recurses into it through ep_call_nested().
+ * @cookie is not used here.
+ *
+ * Returns 0 on success, or the first non-zero result: -1 when a path
+ * limit is exceeded, or whatever the nested call reported.
+ */
+static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
+{
+	struct file *file = priv;
+	struct epitem *epi;
+	int error = 0;
+
+	list_for_each_entry(epi, &file->f_ep_links, fllink) {
+		struct file *watcher = epi->ep->file;
+
+		/* not expected: log it and move on to the next link */
+		if (!is_file_epoll(watcher)) {
+			printk(KERN_ERR "reverse_path_check_proc: "
+				"file is not an ep!\n");
+			continue;
+		}
+
+		if (!list_empty(&watcher->f_ep_links))
+			error = ep_call_nested(&poll_loop_ncalls,
+					       EP_MAX_NESTS,
+					       reverse_path_check_proc,
+					       watcher, watcher,
+					       current);
+		else if (path_count_inc(call_nests))
+			error = -1;
+
+		if (error != 0)
+			break;
+	}
+	return error;
+}
+
+/**
+ * reverse_path_check - bound the wakeup paths added by pending links
+ *
+ * tfile_check_list holds the files that are about to gain new links.  For
+ * each of them the per-depth path counters are reset and the paths
+ * emanating from the file are walked with reverse_path_check_proc(), so
+ * that the new links cannot create more wakeup paths than path_limits[]
+ * allows and leave us spending all our time waking up eventpoll objects.
+ *
+ * Returns: zero if the proposed links don't create too many paths,
+ *	    otherwise the non-zero value reported by the walk.
+ */
+static int reverse_path_check(void)
+{
+	struct file *current_file;
+	int error = 0;
+
+	/* let's call this for all tfiles */
+	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
+		/* the limits apply per file, so every walk starts from zero */
+		path_count_init();
+		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+				       reverse_path_check_proc, current_file,
+				       current_file, current);
+		if (error)
+			break;
+	}
+	return error;
+}
+
/*
* Must be called with "mtx" held.
*/
@@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
*/
ep_rbtree_insert(ep, epi);
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (reverse_path_check())
+ goto error_remove_epi;
+
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);
@@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
return 0;
+error_remove_epi:
+ spin_lock(&tfile->f_lock);
+ if (ep_is_linked(&epi->fllink))
+ list_del_init(&epi->fllink);
+ spin_unlock(&tfile->f_lock);
+
+ rb_erase(&epi->rbn, &ep->rbr);
+
error_unregister:
ep_unregister_pollwait(ep, epi);
@@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
int error = 0;
struct file *file = priv;
struct eventpoll *ep = file->private_data;
+ struct eventpoll *ep_tovisit;
struct rb_node *rbp;
struct epitem *epi;
mutex_lock_nested(&ep->mtx, call_nests + 1);
+ ep->visited = 1;
+ list_add(&ep->visited_list_link, &visited_list);
for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
+ ep_tovisit = epi->ffd.file->private_data;
+ if (ep_tovisit->visited)
+ continue;
error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
- ep_loop_check_proc, epi->ffd.file,
- epi->ffd.file->private_data, current);
+ ep_loop_check_proc, epi->ffd.file,
+ ep_tovisit, current);
if (error != 0)
break;
+ } else {
+ /*
+ * If we've reached a file that is not associated with
+ * an ep, then we need to check if the newly added
+ * links are going to add too many wakeup paths. We do
+ * this by adding it to the tfile_check_list, if it's
+ * not already there, and calling reverse_path_check()
+ * during ep_insert().
+ */
+ if (list_empty(&epi->ffd.file->f_tfile_llink))
+ list_add(&epi->ffd.file->f_tfile_llink,
+ &tfile_check_list);
}
}
mutex_unlock(&ep->mtx);
@@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
*/
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
- return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ int ret;
+ struct eventpoll *ep_cur, *ep_next;
+
+ ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
ep_loop_check_proc, file, ep, current);
+ /*
+ * clear visited list: ep_loop_check_proc() set ->visited on every
+ * eventpoll it walked and queued it on visited_list; undo both here,
+ * whatever the result, before returning it
+ */
+ list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
+ visited_list_link) {
+ ep_cur->visited = 0;
+ list_del(&ep_cur->visited_list_link);
+ }
+ return ret;
+}
+
+/*
+ * Empty tfile_check_list.  Each file is unlinked with list_del_init() so
+ * that its f_tfile_llink reads as empty again, and the list head is
+ * re-initialised afterwards.
+ */
+static void clear_tfile_check_list(void)
+{
+	struct file *cur, *tmp;
+
+	/* first clear the tfile_check_list */
+	list_for_each_entry_safe(cur, tmp, &tfile_check_list, f_tfile_llink)
+		list_del_init(&cur->f_tfile_llink);
+
+	INIT_LIST_HEAD(&tfile_check_list);
}
/*
@@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
*/
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
- int error;
+ int error, fd;
struct eventpoll *ep = NULL;
+ struct file *file;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
- error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+ /* reserve the descriptor first; it is not visible until fd_install() */
+ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
+ if (fd < 0) {
+ error = fd;
+ goto out_free_ep;
+ }
+ file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
- if (error < 0)
- ep_free(ep);
-
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto out_free_fd;
+ }
+ /*
+ * Set ep->file before fd_install(): once the descriptor is installed
+ * other threads sharing the fd table can use or close it, so ep must
+ * be fully set up by then and must not be touched afterwards.
+ */
+ ep->file = file;
+ fd_install(fd, file);
+ return fd;
+
+out_free_fd:
+ put_unused_fd(fd);
+out_free_ep:
+ ep_free(ep);
return error;
}
@@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/*
* When we insert an epoll file descriptor, inside another epoll file
* descriptor, there is the change of creating closed loops, which are
- * better be handled here, than in more critical paths.
+ * better be handled here, than in more critical paths. While we are
+ * checking for loops we also determine the list of files reachable
+ * and hang them on the tfile_check_list, so we can check that we
+ * haven't created too many possible wakeup paths.
*
- * We hold epmutex across the loop check and the insert in this case, in
- * order to prevent two separate inserts from racing and each doing the
- * insert "at the same time" such that ep_loop_check passes on both
- * before either one does the insert, thereby creating a cycle.
+ * We need to hold the epmutex across both ep_insert and ep_remove
+ * b/c we want to make sure we are looking at a coherent view of
+ * epoll network.
*/
- if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+ if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
mutex_lock(&epmutex);
did_lock_epmutex = 1;
- error = -ELOOP;
- if (ep_loop_check(ep, tfile) != 0)
- goto error_tgt_fput;
}
-
+ if (op == EPOLL_CTL_ADD) {
+ if (is_file_epoll(tfile)) {
+ error = -ELOOP;
+ if (ep_loop_check(ep, tfile) != 0)
+ goto error_tgt_fput;
+ } else
+ list_add(&tfile->f_tfile_llink, &tfile_check_list);
+ }
mutex_lock_nested(&ep->mtx, 0);
@@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST;
+ clear_tfile_check_list();
break;
case EPOLL_CTL_DEL:
if (epi)
@@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
mutex_unlock(&ep->mtx);
error_tgt_fput:
- if (unlikely(did_lock_epmutex))
+ if (did_lock_epmutex)
mutex_unlock(&epmutex);
fput(tfile);
diff --git a/fs/exec.c b/fs/exec.c
index 3f64b9f26e7d..aeb135c7ff5c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,6 +59,8 @@
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
+
+#include <trace/events/task.h>
#include "internal.h"
int core_uses_pid;
@@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf)
{
task_lock(tsk);
+ trace_task_rename(tsk, buf);
+
/*
* Threads may access current->comm without holding
* the task lock, so write the string carefully.
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 12ccacda44e0..f9e2cd8cf711 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -23,6 +23,8 @@
#include <trace/events/ext4.h>
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group);
/*
* balloc.c contains the blocks allocation and deallocation routines
*/
@@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
* This function returns the number of file system metadata clusters at
* the beginning of a block group, including the reserved gdt blocks.
*/
-unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1554b15f91bc..513004fc3d84 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,6 +511,14 @@ struct ext4_new_group_data {
__u32 free_blocks_count;
};
+/* Indexes used to index group tables in ext4_new_group_data */
+enum {
+ BLOCK_BITMAP = 0, /* block bitmap */
+ INODE_BITMAP, /* inode bitmap */
+ INODE_TABLE, /* inode tables */
+ GROUP_TABLE_COUNT,
+};
+
/*
* Flags used by ext4_map_blocks()
*/
@@ -575,6 +583,7 @@ struct ext4_new_group_data {
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -957,12 +966,13 @@ struct ext4_inode_info {
#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
EXT4_MOUNT2_##opt)
-#define ext4_set_bit __test_and_set_bit_le
+#define ext4_test_and_set_bit __test_and_set_bit_le
+#define ext4_set_bit __set_bit_le
#define ext4_set_bit_atomic ext2_set_bit_atomic
-#define ext4_clear_bit __test_and_clear_bit_le
+#define ext4_test_and_clear_bit __test_and_clear_bit_le
+#define ext4_clear_bit __clear_bit_le
#define ext4_clear_bit_atomic ext2_clear_bit_atomic
#define ext4_test_bit test_bit_le
-#define ext4_find_first_zero_bit find_first_zero_bit_le
#define ext4_find_next_zero_bit find_next_zero_bit_le
#define ext4_find_next_bit find_next_bit_le
@@ -1397,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
+#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1409,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1790,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb,
extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp);
-extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
- ext4_group_t block_group);
extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp);
@@ -1880,16 +1891,9 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from);
-extern int ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length);
extern int ext4_discard_partial_page_buffers(handle_t *handle,
struct address_space *mapping, loff_t from,
loff_t length, int flags);
-extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1924,6 +1928,7 @@ extern int ext4_group_add(struct super_block *sb,
extern int ext4_group_extend(struct super_block *sb,
struct ext4_super_block *es,
ext4_fsblk_t n_blocks_count);
+extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 841faf5fb785..74f23c292e1b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3280,6 +3280,9 @@ static int ext4_find_delalloc_range(struct inode *inode,
ext4_lblk_t i, pg_lblk;
pgoff_t index;
+ if (!test_opt(inode->i_sb, DELALLOC))
+ return 0;
+
/* reverse search wont work if fs block size is less than page size */
if (inode->i_blkbits < PAGE_CACHE_SHIFT)
search_hint_reverse = 0;
@@ -3452,8 +3455,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
int err = 0;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
- ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
- "block %llu, max_blocks %u, flags %d, allocated %u",
+ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
+ "block %llu, max_blocks %u, flags %x, allocated %u\n",
inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
flags, allocated);
ext4_ext_show_leaf(inode, path);
@@ -3624,7 +3627,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
ext4_lblk_t ex_cluster_start, ex_cluster_end;
- ext4_lblk_t rr_cluster_start, rr_cluster_end;
+ ext4_lblk_t rr_cluster_start;
ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
@@ -3635,7 +3638,6 @@ static int get_implied_cluster_alloc(struct super_block *sb,
/* The requested region passed into ext4_map_blocks() */
rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
- rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
if ((rr_cluster_start == ex_cluster_end) ||
(rr_cluster_start == ex_cluster_start)) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4637af036d9c..25d8c9781ad9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
fatal = ext4_journal_get_write_access(handle, bh2);
}
ext4_lock_group(sb, block_group);
- cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+ cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
if (fatal || !cleared) {
ext4_unlock_group(sb, block_group);
goto out;
@@ -358,7 +358,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
- unsigned int freei, avefreei;
+ unsigned int freei, avefreei, grp_free;
ext4_fsblk_t freeb, avefreec;
unsigned int ndirs;
int max_dirs, min_inodes;
@@ -477,8 +477,8 @@ fallback_retry:
for (i = 0; i < ngroups; i++) {
grp = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, grp, NULL);
- if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_inodes_count(sb, desc) >= avefreei) {
+ grp_free = ext4_free_inodes_count(sb, desc);
+ if (desc && grp_free && grp_free >= avefreei) {
*group = grp;
return 0;
}
@@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb,
*/
down_read(&grp->alloc_sem);
ext4_lock_group(sb, group);
- if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+ if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
/* not a free inode */
retval = 1;
goto err_ret;
@@ -885,8 +885,12 @@ got:
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
- err = -EINVAL;
- goto fail_drop;
+ /*
+ * Likely a bitmap corruption causing inode to be allocated
+ * twice.
+ */
+ err = -EIO;
+ goto fail;
}
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index aa8efa6572d6..feaa82fe629d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -71,6 +71,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+ struct inode *inode, struct page *page, loff_t from,
+ loff_t length, int flags);
/*
* Test whether an inode is a fast symlink.
@@ -2759,7 +2762,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
if (!io_end || !size)
goto out;
- ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
iocb->private, io_end->inode->i_ino, iocb, offset,
size);
@@ -3160,7 +3163,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
*
* Returns zero on sucess or negative on failure.
*/
-int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
struct inode *inode, struct page *page, loff_t from,
loff_t length, int flags)
{
@@ -3300,126 +3303,6 @@ next:
return err;
}
-/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from)
-{
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned length;
- unsigned blocksize;
- struct inode *inode = mapping->host;
-
- blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
-
- return ext4_block_zero_page_range(handle, mapping, from, length);
-}
-
-/*
- * ext4_block_zero_page_range() zeros out a mapping of length 'length'
- * starting from file offset 'from'. The range to be zero'd must
- * be contained with in one block. If the specified range exceeds
- * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
- */
-int ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length)
-{
- ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, max, pos;
- ext4_lblk_t iblock;
- struct inode *inode = mapping->host;
- struct buffer_head *bh;
- struct page *page;
- int err = 0;
-
- page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (!page)
- return -ENOMEM;
-
- blocksize = inode->i_sb->s_blocksize;
- max = blocksize - (offset & (blocksize - 1));
-
- /*
- * correct length if it does not fall between
- * 'from' and the end of the block
- */
- if (length > max || length < 0)
- length = max;
-
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
-
- /* Find the buffer that contains "offset" */
- bh = page_buffers(page);
- pos = blocksize;
- while (offset >= pos) {
- bh = bh->b_this_page;
- iblock++;
- pos += blocksize;
- }
-
- err = 0;
- if (buffer_freed(bh)) {
- BUFFER_TRACE(bh, "freed: skip");
- goto unlock;
- }
-
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "unmapped");
- ext4_get_block(inode, iblock, bh, 0);
- /* unmapped? It's a hole - nothing to do */
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "still unmapped");
- goto unlock;
- }
- }
-
- /* Ok, it's mapped. Make sure it's up-to-date */
- if (PageUptodate(page))
- set_buffer_uptodate(bh);
-
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
- goto unlock;
- }
-
- if (ext4_should_journal_data(inode)) {
- BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (err)
- goto unlock;
- }
-
- zero_user(page, offset, length);
-
- BUFFER_TRACE(bh, "zeroed end of block");
-
- err = 0;
- if (ext4_should_journal_data(inode)) {
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- } else
- mark_buffer_dirty(bh);
-
-unlock:
- unlock_page(page);
- page_cache_release(page);
- return err;
-}
-
int ext4_can_truncate(struct inode *inode)
{
if (S_ISREG(inode->i_mode))
@@ -4646,9 +4529,19 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return 0;
if (is_journal_aborted(journal))
return -EROFS;
+ /* We have to allocate physical blocks for delalloc blocks
+ * before flushing journal. otherwise delalloc blocks can not
+ * be allocated any more. even more truncate on delalloc blocks
+ * could trigger BUG by flushing delalloc blocks in journal.
+ * There is no delalloc block in non-journal data mode.
+ */
+ if (val && test_opt(inode->i_sb, DELALLOC)) {
+ err = ext4_alloc_da_blocks(inode);
+ if (err < 0)
+ return err;
+ }
jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
/*
* OK, there are no updates running now, and all cached data is
@@ -4660,8 +4553,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
- else
+ else {
+ jbd2_journal_flush(journal);
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ }
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e87a932b073b..6eee25591b81 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -18,6 +18,8 @@
#include "ext4_jbd2.h"
#include "ext4.h"
+#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = filp->f_dentry->d_inode;
@@ -186,19 +188,22 @@ setversion_out:
if (err)
return err;
- if (get_user(n_blocks_count, (__u32 __user *)arg))
- return -EFAULT;
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
+ }
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto group_extend_out;
}
err = mnt_want_write_file(filp);
if (err)
- return err;
+ goto group_extend_out;
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
@@ -209,8 +214,8 @@ setversion_out:
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
+group_extend_out:
ext4_resize_end(sb);
-
return err;
}
@@ -251,8 +256,7 @@ setversion_out:
err = ext4_move_extents(filp, donor_filp, me.orig_start,
me.donor_start, me.len, &me.moved_len);
mnt_drop_write_file(filp);
- if (me.moved_len > 0)
- file_remove_suid(donor_filp);
+ mnt_drop_write(filp->f_path.mnt);
if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
@@ -271,19 +275,22 @@ mext_out:
return err;
if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
- sizeof(input)))
- return -EFAULT;
+ sizeof(input))) {
+ err = -EFAULT;
+ goto group_add_out;
+ }
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto group_add_out;
}
err = mnt_want_write_file(filp);
if (err)
- return err;
+ goto group_add_out;
err = ext4_group_add(sb, &input);
if (EXT4_SB(sb)->s_journal) {
@@ -294,8 +301,8 @@ mext_out:
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
+group_add_out:
ext4_resize_end(sb);
-
return err;
}
@@ -335,6 +342,60 @@ mext_out:
return err;
}
+ case EXT4_IOC_RESIZE_FS: {
+ ext4_fsblk_t n_blocks_count;
+ struct super_block *sb = inode->i_sb;
+ int err = 0, err2 = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not (yet) supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_META_BG)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not (yet) supported with meta_bg");
+ return -EOPNOTSUPP;
+ }
+
+ if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+ sizeof(__u64))) {
+ return -EFAULT;
+ }
+
+ if (n_blocks_count > MAX_32_NUM &&
+ !EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "File system only supports 32-bit block numbers");
+ return -EOPNOTSUPP;
+ }
+
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
+
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ goto resizefs_out;
+
+ err = ext4_resize_fs(sb, n_blocks_count);
+ if (EXT4_SB(sb)->s_journal) {
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ if (err == 0)
+ err = err2;
+ mnt_drop_write(filp->f_path.mnt);
+resizefs_out:
+ ext4_resize_end(sb);
+ return err;
+ }
+
case FITRIM:
{
struct request_queue *q = bdev_get_queue(sb->s_bdev);
@@ -433,6 +494,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
case EXT4_IOC_MOVE_EXT:
case FITRIM:
+ case EXT4_IOC_RESIZE_FS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2d8be8f28bf..cb990b21c698 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ext4_group_t group;
ext4_grpblk_t bit;
- trace_ext4_mb_release_group_pa(pa);
+ trace_ext4_mb_release_group_pa(sb, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 996780ab4f4e..f9d948f0eb86 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -134,6 +134,172 @@ static int verify_group_input(struct super_block *sb,
return err;
}
+/*
+ * ext4_new_flex_group_data is used by the 64bit-resize interface to add one
+ * flex group (possibly a partial one) each time.
+ *
+ * @groups and @bg_flags are parallel arrays: bg_flags[i] holds the block
+ * group flags for groups[i].  Only the first @count entries of each array
+ * are in use.
+ */
+struct ext4_new_flex_group_data {
+	struct ext4_new_group_data *groups;	/* new_group_data for groups
+						   in the flex group */
+	__u16 *bg_flags;			/* block group flags of groups
+						   in @groups */
+	ext4_group_t count;			/* number of groups in @groups
+						 */
+};
+
+/*
+ * alloc_flex_gd() allocates a ext4_new_flex_group_data with room for
+ * @flexbg_size groups.
+ *
+ * @flexbg_size: number of entries to allocate in the groups and bg_flags
+ *               arrays; also stored in the count field of the result
+ *
+ * Returns NULL on failure (including a @flexbg_size so large that the array
+ * size computation would overflow or would not fit in the count field),
+ * otherwise the address of the allocated structure.  The result is released
+ * with free_flex_gd().
+ */
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+{
+	struct ext4_new_flex_group_data *flex_gd;
+
+	flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
+	if (flex_gd == NULL)
+		goto out3;
+
+	/*
+	 * Refuse sizes for which sizeof(struct ext4_new_group_data) *
+	 * flexbg_size would wrap, or which would be truncated when stored
+	 * in the 32-bit flex_gd->count.
+	 */
+	if (flexbg_size >= ~0U / sizeof(struct ext4_new_group_data))
+		goto out2;
+	flex_gd->count = flexbg_size;
+
+	flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
+				  flexbg_size, GFP_NOFS);
+	if (flex_gd->groups == NULL)
+		goto out2;
+
+	flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
+	if (flex_gd->bg_flags == NULL)
+		goto out1;
+
+	return flex_gd;
+
+out1:
+	kfree(flex_gd->groups);
+out2:
+	kfree(flex_gd);
+out3:
+	return NULL;
+}
+
+/*
+ * free_flex_gd() releases a structure obtained from alloc_flex_gd(): the two
+ * per-group arrays first, then the container itself.
+ */
+static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
+{
+	kfree(flex_gd->groups);
+	kfree(flex_gd->bg_flags);
+	kfree(flex_gd);
+}
+
+/*
+ * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
+ * and inode tables for a flex group.
+ *
+ * This function is used by 64bit-resize. Note that this function allocates
+ * group tables from the 1st group of groups contained by @flex_gd, which may
+ * be a partial of a flex group.
+ *
+ * Blocks are handed out sequentially from a run of new groups.  A run starts
+ * just past the superblock/GDT overhead of its first group and is extended
+ * over the following groups until one for which ext4_bg_has_super() is true.
+ * When the current run cannot satisfy the next request the code jumps back
+ * to next_group and starts a new run at the first group not covered yet;
+ * bb_index, ib_index and it_index keep their values across that jump, so
+ * tables that were already placed are not placed again.
+ *
+ * Every placed block is charged against free_blocks_count of the group that
+ * contains it (for an inode table: the group containing its first block),
+ * and, when @flexbg_size > 1, EXT4_BG_BLOCK_UNINIT is cleared for that group.
+ *
+ * @sb: super block of fs to which the groups belongs
+ * @flex_gd: the groups to lay out; block_bitmap, inode_bitmap, inode_table
+ *           and free_blocks_count of its entries are updated
+ * @flexbg_size: flex group size (reported as "flexbg size" in the debug
+ *               output)
+ */
+static void ext4_alloc_group_tables(struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd,
+				int flexbg_size)
+{
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	ext4_fsblk_t start_blk;
+	ext4_fsblk_t last_blk;
+	ext4_group_t src_group;
+	ext4_group_t bb_index = 0;
+	ext4_group_t ib_index = 0;
+	ext4_group_t it_index = 0;
+	ext4_group_t group;
+	ext4_group_t last_group;
+	unsigned overhead;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+
+	src_group = group_data[0].group;
+	last_group = src_group + flex_gd->count - 1;
+
+	BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
+	       (last_group & ~(flexbg_size - 1))));
+next_group:
+	group = group_data[0].group;
+	start_blk = ext4_group_first_block_no(sb, src_group);
+	last_blk = start_blk + group_data[src_group - group].blocks_count;
+
+	overhead = ext4_bg_has_super(sb, src_group) ?
+		   (1 + ext4_bg_num_gdb(sb, src_group) +
+		    le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+
+	start_blk += overhead;
+
+	BUG_ON(src_group >= group_data[0].group + flex_gd->count);
+	/*
+	 * We collect contiguous blocks as much as possible: extend last_blk
+	 * over every following group up to (not including) the next group
+	 * that has a superblock.
+	 */
+	src_group++;
+	for (; src_group <= last_group; src_group++)
+		if (!ext4_bg_has_super(sb, src_group))
+			last_blk += group_data[src_group - group].blocks_count;
+		else
+			break;
+
+	/* Allocate block bitmaps, one block per group */
+	for (; bb_index < flex_gd->count; bb_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[bb_index].block_bitmap = start_blk++;
+		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count--;
+		if (flexbg_size > 1)
+			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+	}
+
+	/* Allocate inode bitmaps, one block per group */
+	for (; ib_index < flex_gd->count; ib_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[ib_index].inode_bitmap = start_blk++;
+		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count--;
+		if (flexbg_size > 1)
+			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+	}
+
+	/* Allocate inode tables, s_itb_per_group blocks per group */
+	for (; it_index < flex_gd->count; it_index++) {
+		if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+			goto next_group;
+		group_data[it_index].inode_table = start_blk;
+		ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count -=
+					EXT4_SB(sb)->s_itb_per_group;
+		if (flexbg_size > 1)
+			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+
+		start_blk += EXT4_SB(sb)->s_itb_per_group;
+	}
+
+	if (test_opt(sb, DEBUG)) {
+		int i;
+		group = group_data[0].group;
+
+		printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
+		       "%d groups, flexbg size is %d:\n", flex_gd->count,
+		       flexbg_size);
+
+		for (i = 0; i < flex_gd->count; i++) {
+			printk(KERN_DEBUG "adding %s group %u: %u "
+			       "blocks (%d free)\n",
+			       ext4_bg_has_super(sb, group + i) ? "normal" :
+			       "no-super", group + i,
+			       group_data[i].blocks_count,
+			       group_data[i].free_blocks_count);
+		}
+	}
+}
+
static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
ext4_fsblk_t blk)
{
@@ -179,131 +345,250 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh)
}
/*
- * Set up the block and inode bitmaps, and the inode table for the new group.
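+ * set_flexbg_block_bitmap() marks @count blocks starting from @block used.
+ *
+ * Helper function for setup_new_flex_group_blocks(), which uses it to mark
+ * the blocks taken by group tables in the block bitmaps of the new groups.
+ *
+ * @sb: super block
+ * @handle: journal handle
+ * @flex_gd: flex group data
+ * @block: first block to mark used
+ * @count: number of blocks to mark used
+ */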
+static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
+			struct ext4_new_flex_group_data *flex_gd,
+			ext4_fsblk_t block, ext4_group_t count)
+{
+	ext4_group_t count2;
+
+	ext4_debug("mark blocks [%llu/%u] used\n", block, count);
+	/*
+	 * The range may span several groups; it is processed one group at a
+	 * time, count2 being the number of blocks handled in the current
+	 * group.  Returns 0 on success or a negative error code.
+	 */
+	for (count2 = count; count > 0; count -= count2, block += count2) {
+		ext4_fsblk_t start;
+		struct buffer_head *bh;
+		ext4_group_t group;
+		int err;
+
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+		start = ext4_group_first_block_no(sb, group);
+		/* Turn the group number into an index into flex_gd->groups */
+		group -= flex_gd->groups[0].group;
+
+		/* Do not run past the end of this group's bitmap block */
+		count2 = sb->s_blocksize * 8 - (block - start);
+		if (count2 > count)
+			count2 = count;
+
+		if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
+			BUG_ON(flex_gd->count > 1);
+			continue;
+		}
+
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			return err;
+
+		bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
+		if (!bh)
+			return -EIO;
+
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err) {
+			/* Drop the reference taken by sb_getblk() */
+			brelse(bh);
+			return err;
+		}
+		ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
+			   block - start, count2);
+		ext4_set_bits(bh->b_data, block - start, count2);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		/* Release the buffer whether or not dirtying succeeded */
+		brelse(bh);
+		if (unlikely(err))
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up the block and inode bitmaps, and the inode table for the new groups.
* This doesn't need to be part of the main transaction, since we are only
* changing blocks outside the actual filesystem. We still do journaling to
* ensure the recovery is correct in case of a failure just after resize.
* If any part of this fails, we simply abort the resize.
+ *
+ * setup_new_flex_group_blocks handles a flex group as follow:
+ * 1. copy super block and GDT, and initialize group tables if necessary.
+ * In this step, we only set bits in blocks bitmaps for blocks taken by
+ * super block and GDT.
+ * 2. allocate group tables in block bitmaps, that is, set bits in block
+ * bitmap for blocks taken by group tables.
*/
-static int setup_new_group_blocks(struct super_block *sb,
- struct ext4_new_group_data *input)
+static int setup_new_flex_group_blocks(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
{
+ int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
+ ext4_fsblk_t start;
+ ext4_fsblk_t block;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
- int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
- unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
- struct buffer_head *bh;
+ struct ext4_super_block *es = sbi->s_es;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ __u16 *bg_flags = flex_gd->bg_flags;
handle_t *handle;
- ext4_fsblk_t block;
- ext4_grpblk_t bit;
- int i;
- int err = 0, err2;
+ ext4_group_t group, count;
+ struct buffer_head *bh = NULL;
+ int reserved_gdb, i, j, err = 0, err2;
+
+ BUG_ON(!flex_gd->count || !group_data ||
+ group_data[0].group != sbi->s_groups_count);
+
+ reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
/* This transaction may be extended/restarted along the way */
handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
-
if (IS_ERR(handle))
return PTR_ERR(handle);
- BUG_ON(input->group != sbi->s_groups_count);
+ group = group_data[0].group;
+ for (i = 0; i < flex_gd->count; i++, group++) {
+ unsigned long gdblocks;
- /* Copy all of the GDT blocks into the backup in this group */
- for (i = 0, bit = 1, block = start + 1;
- i < gdblocks; i++, block++, bit++) {
- struct buffer_head *gdb;
+ gdblocks = ext4_bg_num_gdb(sb, group);
+ start = ext4_group_first_block_no(sb, group);
- ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
- err = extend_or_restart_transaction(handle, 1);
- if (err)
- goto exit_journal;
+ /* Copy all of the GDT blocks into the backup in this group */
+ for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
+ struct buffer_head *gdb;
- gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
- goto exit_journal;
- }
- if ((err = ext4_journal_get_write_access(handle, gdb))) {
+ ext4_debug("update backup group %#04llx\n", block);
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+
+ gdb = sb_getblk(sb, block);
+ if (!gdb) {
+ err = -EIO;
+ goto out;
+ }
+
+ err = ext4_journal_get_write_access(handle, gdb);
+ if (err) {
+ brelse(gdb);
+ goto out;
+ }
+ memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
+ gdb->b_size);
+ set_buffer_uptodate(gdb);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+ if (unlikely(err)) {
+ brelse(gdb);
+ goto out;
+ }
brelse(gdb);
- goto exit_journal;
}
- memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
- set_buffer_uptodate(gdb);
- err = ext4_handle_dirty_metadata(handle, NULL, gdb);
- if (unlikely(err)) {
- brelse(gdb);
- goto exit_journal;
+
+ /* Zero out all of the reserved backup group descriptor
+ * table blocks
+ */
+ if (ext4_bg_has_super(sb, group)) {
+ err = sb_issue_zeroout(sb, gdblocks + start + 1,
+ reserved_gdb, GFP_NOFS);
+ if (err)
+ goto out;
}
- brelse(gdb);
- }
- /* Zero out all of the reserved backup group descriptor table blocks */
- ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
- block, sbi->s_itb_per_group);
- err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
- GFP_NOFS);
- if (err)
- goto exit_journal;
+ /* Initialize group tables of the grop @group */
+ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
+ goto handle_bb;
- err = extend_or_restart_transaction(handle, 2);
- if (err)
- goto exit_journal;
+ /* Zero out all of the inode table blocks */
+ block = group_data[i].inode_table;
+ ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
+ block, sbi->s_itb_per_group);
+ err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
+ GFP_NOFS);
+ if (err)
+ goto out;
- bh = bclean(handle, sb, input->block_bitmap);
- if (IS_ERR(bh)) {
- err = PTR_ERR(bh);
- goto exit_journal;
- }
+handle_bb:
+ if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
+ goto handle_ib;
- if (ext4_bg_has_super(sb, input->group)) {
- ext4_debug("mark backup group tables %#04llx (+0)\n", start);
- ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1);
- }
+ /* Initialize block bitmap of the @group */
+ block = group_data[i].block_bitmap;
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
- ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
- input->block_bitmap - start);
- ext4_set_bit(input->block_bitmap - start, bh->b_data);
- ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
- input->inode_bitmap - start);
- ext4_set_bit(input->inode_bitmap - start, bh->b_data);
-
- /* Zero out all of the inode table blocks */
- block = input->inode_table;
- ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
- block, sbi->s_itb_per_group);
- err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
- if (err)
- goto exit_bh;
- ext4_set_bits(bh->b_data, input->inode_table - start,
- sbi->s_itb_per_group);
+ bh = bclean(handle, sb, block);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ goto out;
+ }
+ if (ext4_bg_has_super(sb, group)) {
+ ext4_debug("mark backup superblock %#04llx (+0)\n",
+ start);
+ ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
+ 1);
+ }
+ ext4_mark_bitmap_end(group_data[i].blocks_count,
+ sb->s_blocksize * 8, bh->b_data);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out;
+ brelse(bh);
+handle_ib:
+ if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
+ continue;
- ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
- bh->b_data);
- err = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (unlikely(err)) {
- ext4_std_error(sb, err);
- goto exit_bh;
+ /* Initialize inode bitmap of the @group */
+ block = group_data[i].inode_bitmap;
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+ /* Mark unused entries in inode bitmap used */
+ bh = bclean(handle, sb, block);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ goto out;
+ }
+
+ ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+ sb->s_blocksize * 8, bh->b_data);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out;
+ brelse(bh);
}
- brelse(bh);
- /* Mark unused entries in inode bitmap used */
- ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
- input->inode_bitmap, input->inode_bitmap - start);
- if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
- err = PTR_ERR(bh);
- goto exit_journal;
+ bh = NULL;
+
+ /* Mark group tables in block bitmap */
+ for (j = 0; j < GROUP_TABLE_COUNT; j++) {
+ count = group_table_count[j];
+ start = (&group_data[0].block_bitmap)[j];
+ block = start;
+ for (i = 1; i < flex_gd->count; i++) {
+ block += group_table_count[j];
+ if (block == (&group_data[i].block_bitmap)[j]) {
+ count += group_table_count[j];
+ continue;
+ }
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd, start, count);
+ if (err)
+ goto out;
+ count = group_table_count[j];
+ start = group_data[i].block_bitmap;
+ block = start;
+ }
+
+ if (count) {
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd, start, count);
+ if (err)
+ goto out;
+ }
}
- ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
- bh->b_data);
- err = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (unlikely(err))
- ext4_std_error(sb, err);
-exit_bh:
+out:
brelse(bh);
-
-exit_journal:
- if ((err2 = ext4_journal_stop(handle)) && !err)
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
err = err2;
return err;
@@ -351,10 +636,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
* groups in current filesystem that have BACKUPS, or -ve error code.
*/
static int verify_reserved_gdb(struct super_block *sb,
+ ext4_group_t end,
struct buffer_head *primary)
{
const ext4_fsblk_t blk = primary->b_blocknr;
- const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
unsigned three = 1;
unsigned five = 5;
unsigned seven = 7;
@@ -429,7 +714,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
if (!gdb_bh)
return -EIO;
- gdbackups = verify_reserved_gdb(sb, gdb_bh);
+ gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
if (gdbackups < 0) {
err = gdbackups;
goto exit_bh;
@@ -592,7 +877,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
err = -EIO;
goto exit_bh;
}
- if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
+ gdbackups = verify_reserved_gdb(sb, group, primary[res]);
+ if (gdbackups < 0) {
brelse(primary[res]);
err = gdbackups;
goto exit_bh;
@@ -735,6 +1021,348 @@ exit_err:
}
}
+/*
+ * ext4_add_new_descs() prepares the group descriptor blocks for @count new
+ * groups, the first of which is @group.
+ *
+ * For a group that is the first one in its descriptor block, add_new_gdb()
+ * is called.  For any other group, journal write access to the descriptor
+ * block is obtained and, if the group has a superblock backup with reserved
+ * GDT blocks, reserve_backup_gdb() is called.
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @group: the group no. of the first group desc to be added
+ * @resize_inode: the resize inode
+ * @count: number of group descriptors to be added
+ *
+ * Returns 0 on success, otherwise the first error encountered.
+ */
+static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
+			      ext4_group_t group, struct inode *resize_inode,
+			      ext4_group_t count)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_group_t done;
+	int err;
+
+	for (done = 0; done < count; done++, group++) {
+		int reserved_gdb = 0;
+		int desc_slot, desc_blk;
+
+		if (ext4_bg_has_super(sb, group))
+			reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+
+		desc_slot = group % EXT4_DESC_PER_BLOCK(sb);
+		desc_blk = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * Exactly one of the two cases is handled per group: either
+		 * reserved blocks are added to a backup group, or the group
+		 * opens a new descriptor block.  Supporting both at once
+		 * would mean more complex code, and sane people don't use
+		 * non-sparse filesystems anymore.
+		 */
+		if (!desc_slot) {
+			err = add_new_gdb(handle, resize_inode, group);
+		} else {
+			err = ext4_journal_get_write_access(handle,
+						sbi->s_group_desc[desc_blk]);
+			if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
+				err = reserve_backup_gdb(handle, resize_inode,
+							 group);
+		}
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * ext4_setup_new_descs() fills in the group descriptors of all groups in
+ * @flex_gd and hands each new group to ext4_mb_add_groupinfo().
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @flex_gd: the groups whose descriptors are to be written
+ *
+ * Returns 0 on success, otherwise the first error encountered.
+ */
+static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int idx, err = 0;
+
+	for (idx = 0; idx < flex_gd->count; idx++) {
+		struct ext4_new_group_data *gd = &flex_gd->groups[idx];
+		ext4_group_t group = gd->group;
+		int slot = group % EXT4_DESC_PER_BLOCK(sb);
+		int blk = group / EXT4_DESC_PER_BLOCK(sb);
+		/*
+		 * Write access to this buffer is expected to have been
+		 * obtained already by ext4_add_new_descs().
+		 */
+		struct buffer_head *gdb_bh = sbi->s_group_desc[blk];
+		struct ext4_group_desc *gdp;
+
+		/* Locate and reset the descriptor slot of the new group */
+		gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
+						 slot * EXT4_DESC_SIZE(sb));
+		memset(gdp, 0, EXT4_DESC_SIZE(sb));
+
+		ext4_block_bitmap_set(sb, gdp, gd->block_bitmap);
+		ext4_inode_bitmap_set(sb, gdp, gd->inode_bitmap);
+		ext4_inode_table_set(sb, gdp, gd->inode_table);
+		ext4_free_group_clusters_set(sb, gdp,
+					EXT4_B2C(sbi, gd->free_blocks_count));
+		ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+		gdp->bg_flags = cpu_to_le16(flex_gd->bg_flags[idx]);
+		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+		if (unlikely(err)) {
+			ext4_std_error(sb, err);
+			return err;
+		}
+
+		/*
+		 * With the descriptor in place, mb_alloc can set up its
+		 * per-group memory from it.
+		 */
+		err = ext4_mb_add_groupinfo(sb, group, gdp);
+		if (err)
+			return err;
+	}
+	return err;
+}
+
+/*
+ * ext4_update_super() updates the super block so that the newly added
+ * groups can be seen by the filesystem.
+ *
+ * The block, inode and reserved-block totals of the in-memory superblock,
+ * the per-cpu free counters and (with flex_bg) the flex group counters are
+ * grown by the sums over @flex_gd.  The reserved block count grows so that
+ * it keeps, in whole percent, the share of the filesystem it had before.
+ *
+ * @sb: super block
+ * @flex_gd: new added groups
+ */
+static void ext4_update_super(struct super_block *sb,
+			      struct ext4_new_flex_group_data *flex_gd)
+{
+	ext4_fsblk_t blocks_count = 0;
+	ext4_fsblk_t free_blocks = 0;
+	ext4_fsblk_t reserved_blocks = 0;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int i;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+	/*
+	 * Make the new blocks and inodes valid next.  We do this before
+	 * increasing the group count so that once the group is enabled,
+	 * all of its blocks and inodes are already valid.
+	 *
+	 * We always allocate group-by-group, then block-by-block or
+	 * inode-by-inode within a group, so enabling these
+	 * blocks/inodes before the group is live won't actually let us
+	 * allocate the new space yet.
+	 */
+	for (i = 0; i < flex_gd->count; i++) {
+		blocks_count += group_data[i].blocks_count;
+		free_blocks += group_data[i].free_blocks_count;
+	}
+
+	/*
+	 * Current reserved percentage, applied to the blocks being added.
+	 * The filesystem block count is a 64-bit quantity, so the first
+	 * division must be a full 64-by-64 one: do_div() only uses the low
+	 * 32 bits of its divisor.
+	 */
+	reserved_blocks = ext4_r_blocks_count(es) * 100;
+	reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
+	reserved_blocks *= blocks_count;
+	do_div(reserved_blocks, 100);
+
+	ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+	le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+		     flex_gd->count);
+
+	/*
+	 * We need to protect s_groups_count against other CPUs seeing
+	 * inconsistent state in the superblock.
+	 *
+	 * The precise rules we use are:
+	 *
+	 * * Writers must perform a smp_wmb() after updating all
+	 *   dependent data and before modifying the groups count
+	 *
+	 * * Readers must perform an smp_rmb() after reading the groups
+	 *   count and before reading any dependent data.
+	 *
+	 * NB. These rules can be relaxed when checking the group count
+	 * while freeing data, as we can only allocate from a block
+	 * group after serialising against the group count, and we can
+	 * only then free after serialising in turn against that
+	 * allocation.
+	 */
+	smp_wmb();
+
+	/* Update the global fs size fields */
+	sbi->s_groups_count += flex_gd->count;
+
+	/* Update the reserved block counts only once the new group is
+	 * active. */
+	ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
+				reserved_blocks);
+
+	/* Update the free space counts */
+	percpu_counter_add(&sbi->s_freeclusters_counter,
+			   EXT4_B2C(sbi, free_blocks));
+	percpu_counter_add(&sbi->s_freeinodes_counter,
+			   EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+				      EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, group_data[0].group);
+		atomic_add(EXT4_B2C(sbi, free_blocks),
+			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+	}
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: added group %u:"
+		       "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
+		       blocks_count, free_blocks, reserved_blocks);
+}
+
+/* Add a flex group to an fs. Ensure we handle all possible error conditions
+ * _before_ we start modifying the filesystem, because we cannot abort the
+ * transaction and not have it write the data to disk.
+ *
+ * @sb: super block
+ * @resize_inode: the resize inode, passed on to ext4_add_new_descs()
+ * @flex_gd: the groups to add; the first one must be the group directly
+ *           following the current last group
+ *
+ * Returns 0 on success or a negative error code.  On success the backup
+ * copies of the superblock and of every group descriptor block touched by
+ * the new groups are refreshed through update_backups().
+ */
+static int ext4_flex_group_add(struct super_block *sb,
+			       struct inode *resize_inode,
+			       struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t o_blocks_count;
+	ext4_grpblk_t last;
+	ext4_group_t group;
+	handle_t *handle;
+	unsigned reserved_gdb;
+	int err = 0, err2 = 0, credit;
+
+	BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
+
+	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+	o_blocks_count = ext4_blocks_count(es);
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+	/* The current filesystem must end exactly on a group boundary */
+	BUG_ON(last);
+
+	err = setup_new_flex_group_blocks(sb, flex_gd);
+	if (err)
+		goto exit;
+	/*
+	 * We will always be modifying at least the superblock and  GDT
+	 * block.  If we are adding a group past the last current GDT block,
+	 * we will also modify the inode and the dindirect block.  If we
+	 * are adding a group with superblock/GDT backups  we will also
+	 * modify each of the reserved GDT dindirect blocks.
+	 */
+	credit = flex_gd->count * 4 + reserved_gdb;
+	handle = ext4_journal_start_sb(sb, credit);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit;
+	}
+
+	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+	if (err)
+		goto exit_journal;
+
+	group = flex_gd->groups[0].group;
+	BUG_ON(group != EXT4_SB(sb)->s_groups_count);
+	err = ext4_add_new_descs(handle, sb, group,
+				resize_inode, flex_gd->count);
+	if (err)
+		goto exit_journal;
+
+	err = ext4_setup_new_descs(handle, sb, flex_gd);
+	if (err)
+		goto exit_journal;
+
+	ext4_update_super(sb, flex_gd);
+
+	err = ext4_handle_dirty_super(handle, sb);
+
+exit_journal:
+	err2 = ext4_journal_stop(handle);
+	if (!err)
+		err = err2;
+
+	if (!err) {
+		/*
+		 * s_group_desc[] is indexed by descriptor block, i.e. by
+		 * group / EXT4_DESC_PER_BLOCK (the same mapping used when the
+		 * descriptors were written), not by blocks per group.  Each
+		 * descriptor block covered by the new groups is backed up
+		 * once.
+		 */
+		int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+		int gdb_num_end = (group + flex_gd->count - 1) /
+				  EXT4_DESC_PER_BLOCK(sb);
+
+		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+			       sizeof(struct ext4_super_block));
+		for (; gdb_num <= gdb_num_end; gdb_num++) {
+			struct buffer_head *gdb_bh;
+
+			gdb_bh = sbi->s_group_desc[gdb_num];
+			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
+				       gdb_bh->b_size);
+		}
+	}
+exit:
+	return err;
+}
+
+/*
+ * ext4_setup_next_flex_gd() describes the next batch of groups to add while
+ * growing the filesystem to @n_blocks_count blocks.
+ *
+ * The batch starts at the first group past the current end of the
+ * filesystem and runs up to the last group of that flex group, or up to the
+ * group holding block @n_blocks_count - 1, whichever comes first.  For each
+ * group in the batch the group number, block count, free block count and
+ * initial flags are stored in @flex_gd; the final group of the resize may be
+ * shorter than a full group.
+ *
+ * @sb: super block
+ * @flex_gd: filled in with the batch; count is set to the batch size
+ * @n_blocks_count: target size of the filesystem in blocks
+ * @flexbg_size: number of groups in a flex group
+ *
+ * Returns 0 if the filesystem already has @n_blocks_count blocks, 1 if
+ * @flex_gd was filled in.
+ */
+static int ext4_setup_next_flex_gd(struct super_block *sb,
+				   struct ext4_new_flex_group_data *flex_gd,
+				   ext4_fsblk_t n_blocks_count,
+				   unsigned long flexbg_size)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_new_group_data *gd = flex_gd->groups;
+	ext4_grpblk_t per_group = EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_fsblk_t cur_blocks = ext4_blocks_count(es);
+	ext4_group_t first, final, batch_end;
+	ext4_grpblk_t first_off, final_off;
+	unsigned long idx, tail;
+	int has_gdt_csum;
+
+	if (cur_blocks == n_blocks_count)
+		return 0;
+
+	ext4_get_group_no_and_offset(sb, cur_blocks, &first, &first_off);
+	BUG_ON(first_off);
+	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &final,
+				     &final_off);
+
+	/* Stop at the end of the flex group or at the target group */
+	batch_end = first | (flexbg_size - 1);
+	if (batch_end > final)
+		batch_end = final;
+
+	flex_gd->count = batch_end - first + 1;
+
+	has_gdt_csum = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_GDT_CSUM);
+
+	for (idx = 0; idx < flex_gd->count; idx++) {
+		int overhead = 0;
+
+		gd[idx].group = first + idx;
+		gd[idx].blocks_count = per_group;
+		if (ext4_bg_has_super(sb, first + idx))
+			overhead = 1 + ext4_bg_num_gdb(sb, first + idx) +
+				   le16_to_cpu(es->s_reserved_gdt_blocks);
+		gd[idx].free_blocks_count = per_group - overhead;
+
+		if (has_gdt_csum)
+			flex_gd->bg_flags[idx] = EXT4_BG_BLOCK_UNINIT |
+						 EXT4_BG_INODE_UNINIT;
+		else
+			flex_gd->bg_flags[idx] = EXT4_BG_INODE_ZEROED;
+	}
+
+	tail = idx - 1;
+	if (batch_end == final) {
+		/* We need to initialize block bitmap of last group. */
+		if (has_gdt_csum)
+			flex_gd->bg_flags[tail] &= ~EXT4_BG_BLOCK_UNINIT;
+
+		/* The very last group may be only partially populated */
+		if (final_off != per_group - 1) {
+			gd[tail].blocks_count = final_off + 1;
+			gd[tail].free_blocks_count -= per_group -
+						      final_off - 1;
+		}
+	}
+
+	return 1;
+}
+
/* Add group descriptor data to an existing or new group descriptor block.
* Ensure we handle all possible error conditions _before_ we start modifying
* the filesystem, because we cannot abort the transaction and not have it
@@ -750,16 +1378,15 @@ exit_err:
*/
int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
{
+ struct ext4_new_flex_group_data flex_gd;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
- struct buffer_head *primary = NULL;
- struct ext4_group_desc *gdp;
struct inode *inode = NULL;
- handle_t *handle;
int gdb_off, gdb_num;
- int err, err2;
+ int err;
+ __u16 bg_flags = 0;
gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
@@ -798,175 +1425,69 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
}
- if ((err = verify_group_input(sb, input)))
- goto exit_put;
+ err = verify_group_input(sb, input);
+ if (err)
+ goto out;
- if ((err = setup_new_group_blocks(sb, input)))
- goto exit_put;
+ flex_gd.count = 1;
+ flex_gd.groups = input;
+ flex_gd.bg_flags = &bg_flags;
+ err = ext4_flex_group_add(sb, inode, &flex_gd);
+out:
+ iput(inode);
+ return err;
+} /* ext4_group_add */
- /*
- * We will always be modifying at least the superblock and a GDT
- * block. If we are adding a group past the last current GDT block,
- * we will also modify the inode and the dindirect block. If we
- * are adding a group with superblock/GDT backups we will also
- * modify each of the reserved GDT dindirect blocks.
+/*
+ * Extend a group without checking; assumes that the caller has already done
+ * the necessary checking.
+ */
+static int ext4_group_extend_no_check(struct super_block *sb,
+ ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ handle_t *handle;
+ int err = 0, err2;
+
+ /* We will update the superblock, one block bitmap, and
+ * one group descriptor via ext4_group_add_blocks().
*/
- handle = ext4_journal_start_sb(sb,
- ext4_bg_has_super(sb, input->group) ?
- 3 + reserved_gdb : 4);
+ handle = ext4_journal_start_sb(sb, 3);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- goto exit_put;
+ ext4_warning(sb, "error %d on journal start", err);
+ return err;
}
- if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
- goto exit_journal;
-
- /*
- * We will only either add reserved group blocks to a backup group
- * or remove reserved blocks for the first group in a new group block.
- * Doing both would be mean more complex code, and sane people don't
- * use non-sparse filesystems anymore. This is already checked above.
- */
- if (gdb_off) {
- primary = sbi->s_group_desc[gdb_num];
- if ((err = ext4_journal_get_write_access(handle, primary)))
- goto exit_journal;
-
- if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) {
- err = reserve_backup_gdb(handle, inode, input->group);
- if (err)
- goto exit_journal;
- }
- } else {
- /*
- * Note that we can access new group descriptor block safely
- * only if add_new_gdb() succeeds.
- */
- err = add_new_gdb(handle, inode, input->group);
- if (err)
- goto exit_journal;
- primary = sbi->s_group_desc[gdb_num];
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err) {
+ ext4_warning(sb, "error %d on journal write access", err);
+ goto errout;
}
- /*
- * OK, now we've set up the new group. Time to make it active.
- *
- * so we have to be safe wrt. concurrent accesses the group
- * data. So we need to be careful to set all of the relevant
- * group descriptor data etc. *before* we enable the group.
- *
- * The key field here is sbi->s_groups_count: as long as
- * that retains its old value, nobody is going to access the new
- * group.
- *
- * So first we update all the descriptor metadata for the new
- * group; then we update the total disk blocks count; then we
- * update the groups count to enable the group; then finally we
- * update the free space counts so that the system can start
- * using the new disk blocks.
- */
-
- /* Update group descriptor block for new group */
- gdp = (struct ext4_group_desc *)((char *)primary->b_data +
- gdb_off * EXT4_DESC_SIZE(sb));
-
- memset(gdp, 0, EXT4_DESC_SIZE(sb));
- ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
- ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
- ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
- ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
- ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
- gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
-
- /*
- * We can allocate memory for mb_alloc based on the new group
- * descriptor
- */
- err = ext4_mb_add_groupinfo(sb, input->group, gdp);
+ ext4_blocks_count_set(es, o_blocks_count + add);
+ ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+ /* We add the blocks to the bitmap and set the group need init bit */
+ err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
if (err)
- goto exit_journal;
-
- /*
- * Make the new blocks and inodes valid next. We do this before
- * increasing the group count so that once the group is enabled,
- * all of its blocks and inodes are already valid.
- *
- * We always allocate group-by-group, then block-by-block or
- * inode-by-inode within a group, so enabling these
- * blocks/inodes before the group is live won't actually let us
- * allocate the new space yet.
- */
- ext4_blocks_count_set(es, ext4_blocks_count(es) +
- input->blocks_count);
- le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb));
-
- /*
- * We need to protect s_groups_count against other CPUs seeing
- * inconsistent state in the superblock.
- *
- * The precise rules we use are:
- *
- * * Writers must perform a smp_wmb() after updating all dependent
- * data and before modifying the groups count
- *
- * * Readers must perform an smp_rmb() after reading the groups count
- * and before reading any dependent data.
- *
- * NB. These rules can be relaxed when checking the group count
- * while freeing data, as we can only allocate from a block
- * group after serialising against the group count, and we can
- * only then free after serialising in turn against that
- * allocation.
- */
- smp_wmb();
-
- /* Update the global fs size fields */
- sbi->s_groups_count++;
-
- err = ext4_handle_dirty_metadata(handle, NULL, primary);
- if (unlikely(err)) {
- ext4_std_error(sb, err);
- goto exit_journal;
- }
-
- /* Update the reserved block counts only once the new group is
- * active. */
- ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
- input->reserved_blocks);
-
- /* Update the free space counts */
- percpu_counter_add(&sbi->s_freeclusters_counter,
- EXT4_B2C(sbi, input->free_blocks_count));
- percpu_counter_add(&sbi->s_freeinodes_counter,
- EXT4_INODES_PER_GROUP(sb));
-
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
- sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group;
- flex_group = ext4_flex_group(sbi, input->group);
- atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
- &sbi->s_flex_groups[flex_group].free_clusters);
- atomic_add(EXT4_INODES_PER_GROUP(sb),
- &sbi->s_flex_groups[flex_group].free_inodes);
- }
-
+ goto errout;
ext4_handle_dirty_super(handle, sb);
-
-exit_journal:
- if ((err2 = ext4_journal_stop(handle)) && !err)
+ ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+errout:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
err = err2;
- if (!err && primary) {
- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+
+ if (!err) {
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
+ "blocks\n", ext4_blocks_count(es));
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
sizeof(struct ext4_super_block));
- update_backups(sb, primary->b_blocknr, primary->b_data,
- primary->b_size);
}
-exit_put:
- iput(inode);
return err;
-} /* ext4_group_add */
+}
/*
* Extend the filesystem to the new number of blocks specified. This entry
@@ -985,8 +1506,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_grpblk_t last;
ext4_grpblk_t add;
struct buffer_head *bh;
- handle_t *handle;
- int err, err2;
+ int err;
ext4_group_t group;
o_blocks_count = ext4_blocks_count(es);
@@ -1042,42 +1562,119 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
brelse(bh);
- /* We will update the superblock, one block bitmap, and
- * one group descriptor via ext4_free_blocks().
- */
- handle = ext4_journal_start_sb(sb, 3);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- ext4_warning(sb, "error %d on journal start", err);
- goto exit_put;
+ err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+ return err;
+} /* ext4_group_extend */
+
+/*
+ * ext4_resize_fs() resizes (grows) a fs to the size given by @n_blocks_count
+ *
+ * @sb: super block of the fs to be resized
+ * @n_blocks_count: the total number of blocks in the resized fs
+ *
+ * Returns 0 on success (including when the size is unchanged), -EINVAL when
+ * a shrink is requested, -EPERM when there are not enough reserved GDT
+ * blocks, -ENOSPC when the last block of the new size cannot be read,
+ * -ENOMEM when the flex group data cannot be allocated, or the error
+ * returned by one of the helpers.
+ */
+int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
+{
+ struct ext4_new_flex_group_data *flex_gd = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head *bh;
+ struct inode *resize_inode;
+ ext4_fsblk_t o_blocks_count;
+ ext4_group_t o_group;
+ ext4_group_t n_group;
+ ext4_grpblk_t offset;
+ unsigned long n_desc_blocks;
+ unsigned long o_desc_blocks;
+ unsigned long desc_blocks;
+ int err = 0, flexbg_size = 1;
+
+ o_blocks_count = ext4_blocks_count(es);
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu "
+ "upto %llu blocks\n", o_blocks_count, n_blocks_count);
+
+ if (n_blocks_count < o_blocks_count) {
+ /* On-line shrinking not supported */
+ ext4_warning(sb, "can't shrink FS - resize aborted");
+ return -EINVAL;
}
- if ((err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh))) {
- ext4_warning(sb, "error %d on journal write access", err);
- ext4_journal_stop(handle);
- goto exit_put;
+ if (n_blocks_count == o_blocks_count)
+ /* Nothing to do */
+ return 0;
+
+ /*
+ * n_group is the group holding the last block of the new fs; o_group
+ * and offset locate the first block past the end of the current fs.
+ */
+ ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset);
+
+ n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
+ EXT4_DESC_PER_BLOCK(sb);
+ o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+ EXT4_DESC_PER_BLOCK(sb);
+ desc_blocks = n_desc_blocks - o_desc_blocks;
+
+ if (desc_blocks &&
+ (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
+ le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
+ ext4_warning(sb, "No reserved GDT blocks, can't resize");
+ return -EPERM;
}
- ext4_blocks_count_set(es, o_blocks_count + add);
- ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
- o_blocks_count + add);
- /* We add the blocks to the bitmap and set the group need init bit */
- err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
- ext4_handle_dirty_super(handle, sb);
- ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
- o_blocks_count + add);
- err2 = ext4_journal_stop(handle);
- if (!err && err2)
- err = err2;
- if (err)
- goto exit_put;
+ /*
+ * See if the device is actually as big as what was requested. This is
+ * done before taking the resize inode so that the early return below
+ * does not leak an inode reference.
+ */
+ bh = sb_bread(sb, n_blocks_count - 1);
+ if (!bh) {
+ ext4_warning(sb, "can't read last block, resize aborted");
+ return -ENOSPC;
+ }
+ brelse(bh);
+
+ resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ if (IS_ERR(resize_inode)) {
+ ext4_warning(sb, "Error opening resize inode");
+ return PTR_ERR(resize_inode);
+ }
+
+ if (offset != 0) {
+ /*
+ * The last existing group is only partially used: extend it, but
+ * never beyond the requested size when the new end of the fs lies
+ * in that same group.
+ */
+ ext4_grpblk_t add;
+ if (n_group == o_group)
+ add = n_blocks_count - o_blocks_count;
+ else
+ add = EXT4_BLOCKS_PER_GROUP(sb) - offset;
+ err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+ if (err)
+ goto out;
+ }
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ es->s_log_groups_per_flex)
+ flexbg_size = 1 << es->s_log_groups_per_flex;
+
+ /* Re-read the size: extending the last group may have changed it */
+ o_blocks_count = ext4_blocks_count(es);
+ if (o_blocks_count == n_blocks_count)
+ goto out;
+
+ flex_gd = alloc_flex_gd(flexbg_size);
+ if (flex_gd == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Add flex groups. Note that a regular group is a
+ * flex group with 1 group.
+ */
+ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
+ flexbg_size)) {
+ ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
+ err = ext4_flex_group_add(sb, resize_inode, flex_gd);
+ if (unlikely(err))
+ break;
+ }
+
+out:
+ if (flex_gd)
+ free_flex_gd(flex_gd);
+
+ iput(resize_inode);
if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
- ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
-exit_put:
+ printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu "
+ "upto %llu blocks\n", o_blocks_count, n_blocks_count);
return err;
-} /* ext4_group_extend */
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ed3ce82e2de4..502c61fd7392 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1095,7 +1095,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root)
}
if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
seq_printf(seq, ",max_batch_time=%u",
- (unsigned) sbi->s_min_batch_time);
+ (unsigned) sbi->s_max_batch_time);
}
/*
@@ -2005,17 +2005,16 @@ static int ext4_fill_flex_info(struct super_block *sb)
struct ext4_group_desc *gdp = NULL;
ext4_group_t flex_group_count;
ext4_group_t flex_group;
- int groups_per_flex = 0;
+ unsigned int groups_per_flex = 0;
size_t size;
int i;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
- groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
- if (groups_per_flex < 2) {
+ if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
sbi->s_log_groups_per_flex = 0;
return 1;
}
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
/* We allocate both existing and potentially added groups */
flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
@@ -3506,7 +3505,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* of the filesystem.
*/
if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
"block %u is beyond end of filesystem (%llu)",
le32_to_cpu(es->s_first_data_block),
ext4_blocks_count(es));
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index b60f9f81e33c..d2a200624af5 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -47,8 +47,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int
+ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
const struct xattr *xattr;
handle_t *handle = fs_info;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e2951506434d..f855916657ba 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -20,6 +20,7 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
@@ -29,6 +30,11 @@
#include "internal.h"
/*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
+
+/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
struct wb_writeback_work {
@@ -742,11 +748,17 @@ static long wb_writeback(struct bdi_writeback *wb,
if (work->for_background && !over_bground_thresh(wb->bdi))
break;
+ /*
+ * Kupdate and background works are special and we want to
+ * include all inodes that need writing. Livelock avoidance is
+ * handled by these works yielding to any other work so we are
+ * safe.
+ */
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
- work->older_than_this = &oldest_jif;
- }
+ } else if (work->for_background)
+ oldest_jif = jiffies;
trace_writeback_start(wb->bdi, work);
if (list_empty(&wb->b_io))
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 2aaf3eaaf13d..5f3368ab0fa9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1378,7 +1378,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = -ENOENT;
if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
+ err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+ up_read(&fc->killsb);
+ kfree(buf);
+ return err;
+
+err:
+ kfree(buf);
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_delete_out outarg;
+ int err = -ENOMEM;
+ char *buf;
+ struct qstr name;
+
+ buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ err = -EINVAL;
+ if (size < sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+
+ err = -ENAMETOOLONG;
+ if (outarg.namelen > FUSE_NAME_MAX)
+ goto err;
+
+ err = -EINVAL;
+ if (size != sizeof(outarg) + outarg.namelen + 1)
+ goto err;
+
+ name.name = buf;
+ name.len = outarg.namelen;
+ err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+ if (err)
+ goto err;
+ fuse_copy_finish(cs);
+ buf[outarg.namelen] = 0;
+ name.hash = full_name_hash(name.name, name.len);
+
+ down_read(&fc->killsb);
+ err = -ENOENT;
+ if (fc->sb)
+ err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
+ outarg.child, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1597,6 +1649,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_RETRIEVE:
return fuse_notify_retrieve(fc, size, cs);
+ case FUSE_NOTIFY_DELETE:
+ return fuse_notify_delete(fc, size, cs);
+
default:
fuse_copy_finish(cs);
return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5ddd6ea8f839..206632887bb4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -868,7 +868,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
}
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
- struct qstr *name)
+ u64 child_nodeid, struct qstr *name)
{
int err = -ENOTDIR;
struct inode *parent;
@@ -895,8 +895,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
fuse_invalidate_attr(parent);
fuse_invalidate_entry(entry);
+
+ if (child_nodeid != 0 && entry->d_inode) {
+ mutex_lock(&entry->d_inode->i_mutex);
+ if (get_node_id(entry->d_inode) != child_nodeid) {
+ err = -ENOENT;
+ goto badentry;
+ }
+ if (d_mountpoint(entry)) {
+ err = -EBUSY;
+ goto badentry;
+ }
+ if (S_ISDIR(entry->d_inode->i_mode)) {
+ shrink_dcache_parent(entry);
+ if (!simple_empty(entry)) {
+ err = -ENOTEMPTY;
+ goto badentry;
+ }
+ entry->d_inode->i_flags |= S_DEAD;
+ }
+ dont_mount(entry);
+ clear_nlink(entry->d_inode);
+ err = 0;
+ badentry:
+ mutex_unlock(&entry->d_inode->i_mutex);
+ if (!err)
+ d_delete(entry);
+ } else {
+ err = 0;
+ }
dput(entry);
- err = 0;
unlock:
mutex_unlock(&parent->i_mutex);
@@ -1182,6 +1210,30 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
return fuse_fsync_common(file, start, end, datasync, 1);
}
+static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct inode *dir = file->f_mapping->host;
+
+ /*
+ * FUSE_IOCTL_DIR is only supported for API version >= 7.18; older
+ * connections get -ENOTTY.
+ */
+ if (get_fuse_conn(dir)->minor >= 18)
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
+
+ return -ENOTTY;
+}
+
+static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ const unsigned int flags = FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR;
+ struct inode *dir = file->f_mapping->host;
+
+ /* Same API version gate (minor >= 18) as the native directory ioctl */
+ if (get_fuse_conn(dir)->minor >= 18)
+ return fuse_ioctl_common(file, cmd, arg, flags);
+
+ return -ENOTTY;
+}
+
static bool update_mtime(unsigned ivalid)
{
/* Always update if mtime is explicitly set */
@@ -1596,6 +1648,8 @@ static const struct file_operations fuse_dir_operations = {
.open = fuse_dir_open,
.release = fuse_dir_release,
.fsync = fuse_dir_fsync,
+ .unlocked_ioctl = fuse_dir_ioctl,
+ .compat_ioctl = fuse_dir_compat_ioctl,
};
static const struct inode_operations fuse_common_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0c84100acd44..4a199fd93fbd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1555,48 +1555,16 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
loff_t retval;
struct inode *inode = file->f_path.dentry->d_inode;
- mutex_lock(&inode->i_mutex);
- if (origin != SEEK_CUR && origin != SEEK_SET) {
- retval = fuse_update_attributes(inode, NULL, file, NULL);
- if (retval)
- goto exit;
- }
+ /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
+ if (origin == SEEK_CUR || origin == SEEK_SET)
+ return generic_file_llseek(file, offset, origin);
- switch (origin) {
- case SEEK_END:
- offset += i_size_read(inode);
- break;
- case SEEK_CUR:
- if (offset == 0) {
- retval = file->f_pos;
- goto exit;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- if (offset >= i_size_read(inode)) {
- retval = -ENXIO;
- goto exit;
- }
- break;
- case SEEK_HOLE:
- if (offset >= i_size_read(inode)) {
- retval = -ENXIO;
- goto exit;
- }
- offset = i_size_read(inode);
- break;
- }
- retval = -EINVAL;
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- retval = offset;
- }
-exit:
+ mutex_lock(&inode->i_mutex);
+ retval = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!retval)
+ retval = generic_file_llseek(file, offset, origin);
mutex_unlock(&inode->i_mutex);
+
return retval;
}
@@ -1808,7 +1776,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
err = -ENOMEM;
- pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
+ pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
if (!pages || !iov_page)
goto out;
@@ -1958,8 +1926,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
}
EXPORT_SYMBOL_GPL(fuse_do_ioctl);
-static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
- unsigned long arg, unsigned int flags)
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags)
{
struct inode *inode = file->f_dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1976,13 +1944,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
static long fuse_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- return fuse_file_ioctl_common(file, cmd, arg, 0);
+ return fuse_ioctl_common(file, cmd, arg, 0);
}
static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
}
/*
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1964da0257d9..572cefc78012 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -755,9 +755,15 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
/**
* File-system tells the kernel to invalidate parent attributes and
* the dentry matching parent/name.
+ *
+ * If the child_nodeid is non-zero and:
+ * - matches the inode number for the dentry matching parent/name,
+ * - is not a mount point
+ * - is a file or an empty directory
+ * then the dentry is unhashed (d_delete()).
*/
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
- struct qstr *name);
+ u64 child_nodeid, struct qstr *name);
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir);
@@ -765,6 +771,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
size_t count, loff_t *ppos, int write);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags);
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags);
unsigned fuse_file_poll(struct file *file, poll_table *wait);
int fuse_dev_release(struct inode *inode, struct file *file);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 88e8a23d0026..376816fcd040 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1353,7 +1353,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
spin_lock(&gl->gl_spin);
gl->gl_reply = ret;
- if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
+ if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2553b858a72e..307ac31df781 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -121,8 +121,11 @@ enum {
struct lm_lockops {
const char *lm_proto_name;
- int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
- void (*lm_unmount) (struct gfs2_sbd *sdp);
+ int (*lm_mount) (struct gfs2_sbd *sdp, const char *table);
+ void (*lm_first_done) (struct gfs2_sbd *sdp);
+ void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid,
+ unsigned int result);
+ void (*lm_unmount) (struct gfs2_sbd *sdp);
void (*lm_withdraw) (struct gfs2_sbd *sdp);
void (*lm_put_lock) (struct gfs2_glock *gl);
int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e1d3bb59945c..97742a7ea9cc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -139,8 +139,45 @@ struct gfs2_bufdata {
#define GDLM_STRNAME_BYTES 25
#define GDLM_LVB_SIZE 32
+/*
+ * ls_recover_flags:
+ *
+ * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been
+ * held by failed nodes whose journals need recovery. Those locks should
+ * only be used for journal recovery until the journal recovery is done.
+ * This is set by the dlm recover_prep callback and cleared by the
+ * gfs2_control thread when journal recovery is complete. To avoid
+ * races between recover_prep setting and gfs2_control clearing, recover_spin
+ * is held while changing this bit and reading/writing recover_block
+ * and recover_start.
+ *
+ * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used.
+ *
+ * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing
+ * recovery of all journals before allowing other nodes to mount the fs.
+ * This is cleared when FIRST_MOUNT_DONE is set.
+ *
+ * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished
+ * recovery of all journals, and now allows other nodes to mount the fs.
+ *
+ * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared
+ * BLOCK_LOCKS for the first time. The gfs2_control thread should now
+ * control clearing BLOCK_LOCKS for further recoveries.
+ *
+ * DFL_UNMOUNT: set by gdlm_unmount to keep sdp off gfs2_control_wq.
+ *
+ * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep()
+ * and recover_done(), i.e. set while recover_block == recover_start.
+ */
+
enum {
DFL_BLOCK_LOCKS = 0,
+ DFL_NO_DLM_OPS = 1,
+ DFL_FIRST_MOUNT = 2,
+ DFL_FIRST_MOUNT_DONE = 3,
+ DFL_MOUNT_DONE = 4,
+ DFL_UNMOUNT = 5,
+ DFL_DLM_RECOVERY = 6,
};
struct lm_lockname {
@@ -392,6 +429,7 @@ struct gfs2_jdesc {
#define JDF_RECOVERY 1
unsigned int jd_jid;
unsigned int jd_blocks;
+ int jd_recover_error;
};
struct gfs2_statfs_change_host {
@@ -461,6 +499,7 @@ enum {
SDF_NORECOVERY = 4,
SDF_DEMOTE = 5,
SDF_NOJOURNALID = 6,
+ SDF_RORECOVERY = 7, /* read only recovery */
};
#define GFS2_FSNAME_LEN 256
@@ -499,14 +538,26 @@ struct gfs2_sb_host {
struct lm_lockstruct {
int ls_jid;
unsigned int ls_first;
- unsigned int ls_first_done;
unsigned int ls_nodir;
const struct lm_lockops *ls_ops;
- unsigned long ls_flags;
dlm_lockspace_t *ls_dlm;
- int ls_recover_jid_done;
- int ls_recover_jid_status;
+ int ls_recover_jid_done; /* These two are deprecated, */
+ int ls_recover_jid_status; /* used previously by gfs_controld */
+
+ struct dlm_lksb ls_mounted_lksb; /* mounted_lock */
+ struct dlm_lksb ls_control_lksb; /* control_lock */
+ char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */
+ struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
+
+ spinlock_t ls_recover_spin; /* protects following fields */
+ unsigned long ls_recover_flags; /* DFL_ */
+ uint32_t ls_recover_mount; /* gen in first recover_done cb */
+ uint32_t ls_recover_start; /* gen in last recover_done cb */
+ uint32_t ls_recover_block; /* copy recover_start in last recover_prep */
+ uint32_t ls_recover_size; /* size of recover_submit, recover_result */
+ uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */
+ uint32_t *ls_recover_result; /* result of last jid recovery */
};
struct gfs2_sbd {
@@ -544,6 +595,7 @@ struct gfs2_sbd {
wait_queue_head_t sd_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
+ struct delayed_work sd_control_work;
/* Inode Stuff */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 017960cf1d7a..a7d611b93f0f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -599,9 +599,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
goto fail_end_trans;
- inc_nlink(&ip->i_inode);
- if (S_ISDIR(ip->i_inode.i_mode))
- inc_nlink(&ip->i_inode);
+ set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 98c80d8c2a62..8944d1e32ab5 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
+ * Copyright 2004-2011 Red Hat, Inc.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -11,12 +11,15 @@
#include <linux/dlm.h>
#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/delay.h>
#include <linux/gfs2_ondisk.h>
#include "incore.h"
#include "glock.h"
#include "util.h"
+#include "sys.h"
+extern struct workqueue_struct *gfs2_control_wq;
static void gdlm_ast(void *arg)
{
@@ -185,34 +188,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl)
dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
}
-static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname)
+/*
+ * dlm/gfs2 recovery coordination using dlm_recover callbacks
+ *
+ * 1. dlm_controld sees lockspace members change
+ * 2. dlm_controld blocks dlm-kernel locking activity
+ * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
+ * 4. dlm_controld starts and finishes its own user level recovery
+ * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery
+ * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot)
+ * 7. dlm_recoverd does its own lock recovery
+ * 8. dlm_recoverd unblocks dlm-kernel locking activity
+ * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation)
+ * 10. gfs2_control updates control_lock lvb with new generation and jid bits
+ * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none)
+ * 12. gfs2_recover dequeues and recovers journals of failed nodes
+ * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result)
+ * 14. gfs2_control updates control_lock lvb jid bits for recovered journals
+ * 15. gfs2_control unblocks normal locking when all journals are recovered
+ *
+ * - failures during recovery
+ *
+ * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control
+ * clears BLOCK_LOCKS (step 15), e.g. another node fails while still
+ * recovering for a prior failure. gfs2_control needs a way to detect
+ * this so it can leave BLOCK_LOCKS set in step 15. This is managed using
+ * the recover_block and recover_start values.
+ *
+ * recover_done() provides a new lockspace generation number each time it
+ * is called (step 9). This generation number is saved as recover_start.
+ * When recover_prep() is called, it sets BLOCK_LOCKS and sets
+ * recover_block = recover_start. So, while recover_block is equal to
+ * recover_start, BLOCK_LOCKS should remain set. (recover_spin must
+ * be held around the BLOCK_LOCKS/recover_block/recover_start logic.)
+ *
+ * - more specific gfs2 steps in sequence above
+ *
+ * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start
+ * 6. recover_slot records any failed jids (maybe none)
+ * 9. recover_done sets recover_start = new generation number
+ * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids
+ * 12. gfs2_recover does journal recoveries for failed jids identified above
+ * 14. gfs2_control clears control_lock lvb bits for recovered jids
+ * 15. gfs2_control checks if recover_block == recover_start (step 3 occurred
+ * again) then do nothing, otherwise if recover_start > recover_block
+ * then clear BLOCK_LOCKS.
+ *
+ * - parallel recovery steps across all nodes
+ *
+ * All nodes attempt to update the control_lock lvb with the new generation
+ * number and jid bits, but only the first to get the control_lock EX will
+ * do so; others will see that it's already done (lvb already contains new
+ * generation number.)
+ *
+ * . All nodes get the same recover_prep/recover_slot/recover_done callbacks
+ * . All nodes attempt to set control_lock lvb gen + bits for the new gen
+ * . One node gets control_lock first and writes the lvb, others see it's done
+ * . All nodes attempt to recover jids for which they see control_lock bits set
+ * . One node succeeds for a jid, and that one clears the jid bit in the lvb
+ * . All nodes will eventually see all lvb bits clear and unblock locks
+ *
+ * - is there a problem with clearing an lvb bit that should be set
+ * and missing a journal recovery?
+ *
+ * 1. jid fails
+ * 2. lvb bit set for step 1
+ * 3. jid recovered for step 1
+ * 4. jid taken again (new mount)
+ * 5. jid fails (for step 4)
+ * 6. lvb bit set for step 5 (will already be set)
+ * 7. lvb bit cleared for step 3
+ *
+ * This is not a problem because the failure in step 5 does not
+ * require recovery, because the mount in step 4 could not have
+ * progressed far enough to unblock locks and access the fs. The
+ * control_mount() function waits for all recoveries to be complete
+ * for the latest lockspace generation before ever unblocking locks
+ * and returning. The mount in step 4 waits until the recovery in
+ * step 1 is done.
+ *
+ * - special case of first mounter: first node to mount the fs
+ *
+ * The first node to mount a gfs2 fs needs to check all the journals
+ * and recover any that need recovery before other nodes are allowed
+ * to mount the fs. (Others may begin mounting, but they must wait
+ * for the first mounter to be done before taking locks on the fs
+ * or accessing the fs.) This has two parts:
+ *
+ * 1. The mounted_lock tells a node it's the first to mount the fs.
+ * Each node holds the mounted_lock in PR while it's mounted.
+ * Each node tries to acquire the mounted_lock in EX when it mounts.
+ * If a node is granted the mounted_lock EX it means there are no
+ * other mounted nodes (no PR locks exist), and it is the first mounter.
+ * The mounted_lock is demoted to PR when first recovery is done, so
+ * others will fail to get an EX lock, but will get a PR lock.
+ *
+ * 2. The control_lock blocks others in control_mount() while the first
+ * mounter is doing first mount recovery of all journals.
+ * A mounting node needs to acquire control_lock in EX mode before
+ * it can proceed. The first mounter holds control_lock in EX while doing
+ * the first mount recovery, blocking mounts from other nodes, then demotes
+ * control_lock to NL when it's done (others_may_mount/first_done),
+ * allowing other nodes to continue mounting.
+ *
+ * first mounter:
+ * control_lock EX/NOQUEUE success
+ * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters)
+ * set first=1
+ * do first mounter recovery
+ * mounted_lock EX->PR
+ * control_lock EX->NL, write lvb generation
+ *
+ * other mounter:
+ * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry)
+ * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR)
+ * mounted_lock PR/NOQUEUE success
+ * read lvb generation
+ * control_lock EX->NL
+ * set first=0
+ *
+ * - mount during recovery
+ *
+ * If a node mounts while others are doing recovery (not first mounter),
+ * the mounting node will get its initial recover_done() callback without
+ * having seen any previous failures/callbacks.
+ *
+ * It must wait for all recoveries preceding its mount to be finished
+ * before it unblocks locks. It does this by repeating the "other mounter"
+ * steps above until the lvb generation number is >= its mount generation
+ * number (from initial recover_done) and all lvb bits are clear.
+ *
+ * - control_lock lvb format
+ *
+ * 4 bytes generation number: the latest dlm lockspace generation number
+ * from recover_done callback. Indicates the jid bitmap has been updated
+ * to reflect all slot failures through that generation.
+ * 4 bytes unused.
+ * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates
+ * that jid N needs recovery.
+ */
+
+#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */
+
+/*
+ * Snapshot the control_lock lvb into lvb_bits and decode its leading
+ * generation number.
+ *
+ * lvb_bits must have room for GDLM_LVB_SIZE bytes; it receives the whole
+ * lvb, generation field included.  *lvb_gen receives the generation
+ * converted from its little-endian stored form to cpu byte order.
+ */
+static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
+			     char *lvb_bits)
+{
+	__le32 gen;	/* generation exactly as stored in the lvb */
+
+	memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
+	/* memcpy rather than a cast: lvb_bits is a plain char buffer */
+	memcpy(&gen, lvb_bits, sizeof(gen));
+	*lvb_gen = le32_to_cpu(gen);
+}
+
+/*
+ * Store lvb_bits into the control_lock lvb and stamp lvb_gen into its
+ * leading generation field.
+ *
+ * All GDLM_LVB_SIZE bytes of lvb_bits are copied first; the first four
+ * bytes are then overwritten with lvb_gen in little-endian order, so any
+ * generation value that lvb_bits itself carried is replaced.
+ */
+static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
+			      char *lvb_bits)
+{
+	__le32 gen = cpu_to_le32(lvb_gen);
+
+	memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
+	memcpy(ls->ls_control_lvb, &gen, sizeof(gen));
+}
+
+/*
+ * Return 1 when every byte of the jid bitmap in an lvb copy is zero (no
+ * journal is flagged as needing recovery), otherwise 0.  The bitmap is the
+ * part of the lvb from JID_BITMAP_OFFSET up to GDLM_LVB_SIZE.
+ */
+static int all_jid_bits_clear(char *lvb)
+{
+	const char *p = lvb + JID_BITMAP_OFFSET;
+	const char *end = lvb + GDLM_LVB_SIZE;
+
+	while (p < end) {
+		if (*p++)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Completion callback passed to dlm_lock() by sync_lock(): arg is the
+ * lm_lockstruct whose ls_sync_wait completion the requester waits on.
+ */
+static void sync_wait_cb(void *arg)
+{
+	complete(&((struct lm_lockstruct *)arg)->ls_sync_wait);
+}
+
+static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int error;
- if (fsname == NULL) {
- fs_info(sdp, "no fsname found\n");
- return -EINVAL;
+ error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
+ if (error) {
+ fs_err(sdp, "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&ls->ls_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ fs_err(sdp, "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Issue a dlm_lock() request for the non-disk lock number num and wait for
+ * its completion callback before returning.
+ *
+ * mode/flags are passed straight to dlm_lock(); lksb is the status block
+ * for the lock; name is only used in log messages.
+ *
+ * Returns the dlm_lock() error if the request could not be submitted,
+ * otherwise the final lksb->sb_status (0 on success).  A status of -EAGAIN
+ * is returned without being logged; any other non-zero status is logged.
+ */
+static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
+		     unsigned int num, struct dlm_lksb *lksb, char *name)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	char strname[GDLM_STRNAME_BYTES] = "";	/* fully zero-filled */
+	int rv;
+
+	snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
+
+	rv = dlm_lock(ls->ls_dlm, mode, lksb, flags,
+		      strname, GDLM_STRNAME_BYTES - 1,
+		      0, sync_wait_cb, ls, NULL);
+	if (rv) {
+		fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
+		       name, lksb->sb_lkid, flags, mode, rv);
+		return rv;
+	}
+
+	/* sync_wait_cb() completes ls_sync_wait when the request is done */
+	wait_for_completion(&ls->ls_sync_wait);
+
+	rv = lksb->sb_status;
+	if (rv != 0 && rv != -EAGAIN) {
+		fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n",
+		       name, lksb->sb_lkid, flags, mode, rv);
+	}
+
+	return rv;
+}
+
+/* Synchronously unlock mounted_lock; returns the result of sync_unlock(). */
+static int mounted_unlock(struct gfs2_sbd *sdp)
+{
+	struct dlm_lksb *lksb = &sdp->sd_lockstruct.ls_mounted_lksb;
+
+	return sync_unlock(sdp, lksb, "mounted_lock");
+}
+
+/*
+ * Synchronously request mounted_lock (GFS2_MOUNTED_LOCK) in the given mode
+ * with the given dlm flags; returns the result of sync_lock().
+ */
+static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+	struct dlm_lksb *lksb = &sdp->sd_lockstruct.ls_mounted_lksb;
+
+	return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK, lksb,
+			 "mounted_lock");
+}
+
+/* Synchronously unlock control_lock; returns the result of sync_unlock(). */
+static int control_unlock(struct gfs2_sbd *sdp)
+{
+	struct dlm_lksb *lksb = &sdp->sd_lockstruct.ls_control_lksb;
+
+	return sync_unlock(sdp, lksb, "control_lock");
+}
+
+/*
+ * Synchronously request control_lock (GFS2_CONTROL_LOCK) in the given mode
+ * with the given dlm flags; returns the result of sync_lock().
+ */
+static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+	struct dlm_lksb *lksb = &sdp->sd_lockstruct.ls_control_lksb;
+
+	return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK, lksb,
+			 "control_lock");
+}
+
+/*
+ * Work function for sd_control_work (queued on gfs2_control_wq by
+ * gdlm_recover_done() and gdlm_recovery_result()).
+ *
+ * It converts control_lock to EX, folds ls_recover_submit[] and
+ * ls_recover_result[] into the jid bitmap of the control_lock lvb,
+ * converts back to NL (writing the lvb if it changed), calls
+ * gfs2_recover_set() for every jid whose bit is still set, and, when no
+ * bit is set and no new recover_prep/recover_done has changed the
+ * generations in the meantime, clears BLOCK_LOCKS and thaws the glocks.
+ */
+static void gfs2_control_func(struct work_struct *work)
+{
+ struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char lvb_bits[GDLM_LVB_SIZE];
+ uint32_t block_gen, start_gen, lvb_gen, flags;
+ int recover_set = 0; /* number of jids passed to gfs2_recover_set() */
+ int write_lvb = 0; /* non-zero when the lvb must be written back */
+ int recover_size;
+ int i, error;
+
+ spin_lock(&ls->ls_recover_spin);
+ /*
+ * No MOUNT_DONE means we're still mounting; control_mount()
+ * will set this flag, after which this thread will take over
+ * all further clearing of BLOCK_LOCKS.
+ *
+ * FIRST_MOUNT means this node is doing first mounter recovery,
+ * for which recovery control is handled by
+ * control_mount()/control_first_done(), not this thread.
+ */
+ if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+ test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+ /* snapshot both generations; they are re-checked under the lock below */
+ block_gen = ls->ls_recover_block;
+ start_gen = ls->ls_recover_start;
+ spin_unlock(&ls->ls_recover_spin);
+
+ /*
+ * Equal block_gen and start_gen implies we are between
+ * recover_prep and recover_done callbacks, which means
+ * dlm recovery is in progress and dlm locking is blocked.
+ * There's no point trying to do any work until recover_done.
+ */
+
+ if (block_gen == start_gen)
+ return;
+
+ /*
+ * Propagate recover_submit[] and recover_result[] to lvb:
+ * dlm_recoverd adds to recover_submit[] jids needing recovery
+ * gfs2_recover adds to recover_result[] journal recovery results
+ *
+ * set lvb bit for jids in recover_submit[] if the lvb has not
+ * yet been updated for the generation of the failure
+ *
+ * clear lvb bit for jids in recover_result[] if the result of
+ * the journal recovery is SUCCESS
+ */
+
+ error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ if (error) {
+ fs_err(sdp, "control lock EX error %d\n", error);
+ return;
+ }
+
+ control_lvb_read(ls, &lvb_gen, lvb_bits);
+
+ spin_lock(&ls->ls_recover_spin);
+ /* a recover_prep/recover_done ran while we waited for the lock: back
+ out to NL without touching the lvb and do nothing further */
+ if (block_gen != ls->ls_recover_block ||
+ start_gen != ls->ls_recover_start) {
+ fs_info(sdp, "recover generation %u block1 %u %u\n",
+ start_gen, block_gen, ls->ls_recover_block);
+ spin_unlock(&ls->ls_recover_spin);
+ control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+ return;
+ }
+
+ recover_size = ls->ls_recover_size;
+
+ if (lvb_gen <= start_gen) {
+ /*
+ * Clear lvb bits for jids we've successfully recovered.
+ * Because all nodes attempt to recover failed journals,
+ * a journal can be recovered multiple times successfully
+ * in succession. Only the first will really do recovery,
+ * the others find it clean, but still report a successful
+ * recovery. So, another node may have already recovered
+ * the jid and cleared the lvb bit for it.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (ls->ls_recover_result[i] != LM_RD_SUCCESS)
+ continue;
+
+ ls->ls_recover_result[i] = 0;
+
+ if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET))
+ continue;
+
+ __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
+ write_lvb = 1;
+ }
+ }
+
+ if (lvb_gen == start_gen) {
+ /*
+ * Failed slots before start_gen are already set in lvb.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (!ls->ls_recover_submit[i])
+ continue;
+ if (ls->ls_recover_submit[i] < lvb_gen)
+ ls->ls_recover_submit[i] = 0;
+ }
+ } else if (lvb_gen < start_gen) {
+ /*
+ * Failed slots before start_gen are not yet set in lvb.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (!ls->ls_recover_submit[i])
+ continue;
+ if (ls->ls_recover_submit[i] < start_gen) {
+ ls->ls_recover_submit[i] = 0;
+ __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
+ }
+ }
+ /* even if there are no bits to set, we need to write the
+ latest generation to the lvb */
+ write_lvb = 1;
+ } else {
+ /*
+ * we should be getting a recover_done() for lvb_gen soon
+ */
+ }
+ spin_unlock(&ls->ls_recover_spin);
+
+ /* only pass VALBLK on the down-conversion when the lvb was modified */
+ if (write_lvb) {
+ control_lvb_write(ls, start_gen, lvb_bits);
+ flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
+ } else {
+ flags = DLM_LKF_CONVERT;
+ }
+
+ error = control_lock(sdp, DLM_LOCK_NL, flags);
+ if (error) {
+ fs_err(sdp, "control lock NL error %d\n", error);
+ return;
+ }
+
+ /*
+ * Everyone will see jid bits set in the lvb, run gfs2_recover_set(),
+ * and clear a jid bit in the lvb if the recovery is a success.
+ * Eventually all journals will be recovered, all jid bits will
+ * be cleared in the lvb, and everyone will clear BLOCK_LOCKS.
+ */
+
+ for (i = 0; i < recover_size; i++) {
+ if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) {
+ fs_info(sdp, "recover generation %u jid %d\n",
+ start_gen, i);
+ gfs2_recover_set(sdp, i);
+ recover_set++;
+ }
+ }
+ /* recoveries are pending; gdlm_recovery_result() requeues this work */
+ if (recover_set)
+ return;
+
+ /*
+ * No more jid bits set in lvb, all recovery is done, unblock locks
+ * (unless a new recover_prep callback has occurred blocking locks
+ * again while working above)
+ */
+
+ spin_lock(&ls->ls_recover_spin);
+ if (ls->ls_recover_block == block_gen &&
+ ls->ls_recover_start == start_gen) {
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "recover generation %u done\n", start_gen);
+ gfs2_glock_thaw(sdp);
+ } else {
+ fs_info(sdp, "recover generation %u block2 %u %u\n",
+ start_gen, block_gen, ls->ls_recover_block);
+ spin_unlock(&ls->ls_recover_spin);
+ }
+}
+
+static int control_mount(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char lvb_bits[GDLM_LVB_SIZE];
+ uint32_t start_gen, block_gen, mount_gen, lvb_gen;
+ int mounted_mode;
+ int retries = 0;
+ int error;
+
+ memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb));
+ memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb));
+ memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE);
+ ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb;
+ init_completion(&ls->ls_sync_wait);
+
+ set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK);
+ if (error) {
+ fs_err(sdp, "control_mount control_lock NL error %d\n", error);
+ return error;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_NL, 0);
+ if (error) {
+ fs_err(sdp, "control_mount mounted_lock NL error %d\n", error);
+ control_unlock(sdp);
+ return error;
+ }
+ mounted_mode = DLM_LOCK_NL;
+
+restart:
+ if (retries++ && signal_pending(current)) {
+ error = -EINTR;
+ goto fail;
+ }
+
+ /*
+ * We always start with both locks in NL. control_lock is
+ * demoted to NL below so we don't need to do it here.
+ */
+
+ if (mounted_mode != DLM_LOCK_NL) {
+ error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+ if (error)
+ goto fail;
+ mounted_mode = DLM_LOCK_NL;
+ }
+
+ /*
+ * Other nodes need to do some work in dlm recovery and gfs2_control
+ * before the recover_done and control_lock will be ready for us below.
+ * A delay here is not required but often avoids having to retry.
+ */
+
+ msleep_interruptible(500);
+
+ /*
+ * Acquire control_lock in EX and mounted_lock in either EX or PR.
+ * control_lock lvb keeps track of any pending journal recoveries.
+ * mounted_lock indicates if any other nodes have the fs mounted.
+ */
+
+ error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK);
+ if (error == -EAGAIN) {
+ goto restart;
+ } else if (error) {
+ fs_err(sdp, "control_mount control_lock EX error %d\n", error);
+ goto fail;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
+ if (!error) {
+ mounted_mode = DLM_LOCK_EX;
+ goto locks_done;
+ } else if (error != -EAGAIN) {
+ fs_err(sdp, "control_mount mounted_lock EX error %d\n", error);
+ goto fail;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
+ if (!error) {
+ mounted_mode = DLM_LOCK_PR;
+ goto locks_done;
+ } else {
+ /* not even -EAGAIN should happen here */
+ fs_err(sdp, "control_mount mounted_lock PR error %d\n", error);
+ goto fail;
+ }
+
+locks_done:
+ /*
+ * If we got both locks above in EX, then we're the first mounter.
+ * If not, then we need to wait for the control_lock lvb to be
+ * updated by other mounted nodes to reflect our mount generation.
+ *
+ * In simple first mounter cases, first mounter will see zero lvb_gen,
+ * but in cases where all existing nodes leave/fail before mounting
+ * nodes finish control_mount, then all nodes will be mounting and
+ * lvb_gen will be non-zero.
+ */
+
+ control_lvb_read(ls, &lvb_gen, lvb_bits);
+
+ if (lvb_gen == 0xFFFFFFFF) {
+ /* special value to force mount attempts to fail */
+ fs_err(sdp, "control_mount control_lock disabled\n");
+ error = -EINVAL;
+ goto fail;
+ }
+
+ if (mounted_mode == DLM_LOCK_EX) {
+ /* first mounter, keep both EX while doing first recovery */
+ spin_lock(&ls->ls_recover_spin);
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
+ set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "first mounter control generation %u\n", lvb_gen);
+ return 0;
+ }
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+ if (error)
+ goto fail;
+
+ /*
+ * We are not first mounter, now we need to wait for the control_lock
+ * lvb generation to be >= the generation from our first recover_done
+ * and all lvb bits to be clear (no pending journal recoveries.)
+ */
+
+ if (!all_jid_bits_clear(lvb_bits)) {
+ /* journals need recovery, wait until all are clear */
+ fs_info(sdp, "control_mount wait for journal recovery\n");
+ goto restart;
+ }
+
+ spin_lock(&ls->ls_recover_spin);
+ block_gen = ls->ls_recover_block;
+ start_gen = ls->ls_recover_start;
+ mount_gen = ls->ls_recover_mount;
+
+ if (lvb_gen < mount_gen) {
+ /* wait for mounted nodes to update control_lock lvb to our
+ generation, which might include new recovery bits set */
+ fs_info(sdp, "control_mount wait1 block %u start %u mount %u "
+ "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+ lvb_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ goto restart;
+ }
+
+ if (lvb_gen != start_gen) {
+ /* wait for mounted nodes to update control_lock lvb to the
+ latest recovery generation */
+ fs_info(sdp, "control_mount wait2 block %u start %u mount %u "
+ "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+ lvb_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ goto restart;
+ }
+
+ if (block_gen == start_gen) {
+ /* dlm recovery in progress, wait for it to finish */
+ fs_info(sdp, "control_mount wait3 block %u start %u mount %u "
+ "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+ lvb_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ goto restart;
}
- error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm,
- DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
- (ls->ls_nodir ? DLM_LSFL_NODIR : 0),
- GDLM_LVB_SIZE);
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
+ memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
+ memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
+ spin_unlock(&ls->ls_recover_spin);
+ return 0;
+
+fail:
+ mounted_unlock(sdp);
+ control_unlock(sdp);
+ return error;
+}
+
+/*
+ * Action callback handed to wait_on_bit() in control_first_done(): yield
+ * the cpu via schedule() and always return 0.  The word argument is unused.
+ */
+static int dlm_recovery_wait(void *word)
+{
+ schedule();
+ return 0;
+}
+
+static int control_first_done(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char lvb_bits[GDLM_LVB_SIZE];
+ uint32_t start_gen, block_gen;
+ int error;
+
+restart:
+ spin_lock(&ls->ls_recover_spin);
+ start_gen = ls->ls_recover_start;
+ block_gen = ls->ls_recover_block;
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) ||
+ !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+ !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ /* sanity check, should not happen */
+ fs_err(sdp, "control_first_done start %u block %u flags %lx\n",
+ start_gen, block_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ control_unlock(sdp);
+ return -1;
+ }
+
+ if (start_gen == block_gen) {
+ /*
+ * Wait for the end of a dlm recovery cycle to switch from
+ * first mounter recovery. We can ignore any recover_slot
+ * callbacks between the recover_prep and next recover_done
+ * because we are still the first mounter and any failed nodes
+ * have not fully mounted, so they don't need recovery.
+ */
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
+
+ wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
+ dlm_recovery_wait, TASK_UNINTERRUPTIBLE);
+ goto restart;
+ }
+
+ clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags);
+ memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
+ memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
+ spin_unlock(&ls->ls_recover_spin);
+
+ memset(lvb_bits, 0, sizeof(lvb_bits));
+ control_lvb_write(ls, start_gen, lvb_bits);
+
+ error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);
+ if (error)
+ fs_err(sdp, "control_first_done mounted PR error %d\n", error);
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
if (error)
- printk(KERN_ERR "dlm_new_lockspace error %d", error);
+ fs_err(sdp, "control_first_done control NL error %d\n", error);
return error;
}
+/*
+ * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
+ * to accommodate the largest slot number. (NB dlm slot numbers start at 1,
+ * gfs2 jids start at 0, so jid = slot - 1)
+ */
+
+#define RECOVER_SIZE_INC 16
+
+/*
+ * Make sure ls_recover_submit[] and ls_recover_result[] can be indexed by
+ * every jid implied by slots[] (jid = slot - 1).
+ *
+ * slots/num_slots: slot array to size for; with num_slots == 0 slots is
+ * not dereferenced and the arrays are sized for jid 0 only.
+ *
+ * The arrays only grow, in multiples of RECOVER_SIZE_INC.  Existing
+ * entries are preserved and new entries start out zero.
+ *
+ * Returns 0 on success (including when no growth is needed), or -ENOMEM
+ * if either new array cannot be allocated, in which case the existing
+ * arrays are left untouched.
+ */
+static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
+			    int num_slots)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	uint32_t *submit = NULL;
+	uint32_t *result = NULL;
+	uint32_t old_size, new_size;
+	int i, max_jid;
+
+	max_jid = 0;
+	for (i = 0; i < num_slots; i++) {
+		if (max_jid < slots[i].slot - 1)
+			max_jid = slots[i].slot - 1;
+	}
+
+	old_size = ls->ls_recover_size;
+
+	/*
+	 * One increment is not always enough: the largest slot can be more
+	 * than RECOVER_SIZE_INC beyond the current size, so keep growing
+	 * until max_jid is a valid index.
+	 */
+	new_size = old_size;
+	while (new_size < max_jid + 1)
+		new_size += RECOVER_SIZE_INC;
+	if (new_size == old_size)
+		return 0;
+
+	/* allocate before taking the spinlock */
+	submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
+	result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
+	if (!submit || !result) {
+		kfree(submit);
+		kfree(result);
+		return -ENOMEM;
+	}
+
+	/* swap the arrays in under the lock that protects their users */
+	spin_lock(&ls->ls_recover_spin);
+	memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t));
+	memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t));
+	kfree(ls->ls_recover_submit);
+	kfree(ls->ls_recover_result);
+	ls->ls_recover_submit = submit;
+	ls->ls_recover_result = result;
+	ls->ls_recover_size = new_size;
+	spin_unlock(&ls->ls_recover_spin);
+	return 0;
+}
+
+/*
+ * Release both per-jid recovery arrays and reset the bookkeeping so the
+ * lockstruct reads as "no arrays allocated".  Takes no lock itself.
+ */
+static void free_recover_size(struct lm_lockstruct *ls)
+{
+	uint32_t *submit = ls->ls_recover_submit;
+	uint32_t *result = ls->ls_recover_result;
+
+	kfree(submit);
+	kfree(result);
+
+	ls->ls_recover_submit = ls->ls_recover_result = NULL;
+	ls->ls_recover_size = 0;
+}
+
+/* dlm calls before it does lock recovery */
+
+/*
+ * recover_prep callback: record that a dlm recovery cycle has begun and,
+ * once this node has finished mounting, block gfs2 locking.
+ */
+static void gdlm_recover_prep(void *arg)
+{
+	struct gfs2_sbd *sdp = arg;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	unsigned long *flags = &ls->ls_recover_flags;
+
+	spin_lock(&ls->ls_recover_spin);
+
+	/* block == start is how other code detects "dlm recovery running" */
+	ls->ls_recover_block = ls->ls_recover_start;
+	set_bit(DFL_DLM_RECOVERY, flags);
+
+	/*
+	 * BLOCK_LOCKS is only set here after the mount has completed and
+	 * when this node is not in first mounter recovery.
+	 */
+	if (test_bit(DFL_MOUNT_DONE, flags) &&
+	    !test_bit(DFL_FIRST_MOUNT, flags))
+		set_bit(DFL_BLOCK_LOCKS, flags);
+
+	spin_unlock(&ls->ls_recover_spin);
+}
+
+/* dlm calls after recover_prep has been completed on all lockspace members;
+ identifies slot/jid of failed member */
+
+/*
+ * recover_slot callback: remember that the journal belonging to a failed
+ * slot needs recovery by storing the current recover_block generation in
+ * ls_recover_submit[jid] (jid = slot - 1).
+ *
+ * If the arrays are too short for the jid, the failure is logged and the
+ * slot is not recorded.  A previously recorded, not yet consumed
+ * generation for the same jid is logged and then overwritten.
+ */
+static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
+{
+	struct gfs2_sbd *sdp = arg;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int jid = slot->slot - 1;
+
+	spin_lock(&ls->ls_recover_spin);
+	if (ls->ls_recover_size < jid + 1) {
+		/* log lines must end in '\n' or they run into the next one */
+		fs_err(sdp, "recover_slot jid %d gen %u short size %u\n",
+		       jid, ls->ls_recover_block, ls->ls_recover_size);
+		spin_unlock(&ls->ls_recover_spin);
+		return;
+	}
+
+	if (ls->ls_recover_submit[jid]) {
+		fs_info(sdp, "recover_slot jid %d gen %u prev %u\n",
+			jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
+	}
+	ls->ls_recover_submit[jid] = ls->ls_recover_block;
+	spin_unlock(&ls->ls_recover_spin);
+}
+
+/* dlm calls after recover_slot and after it completes lock recovery */
+
+/*
+ * recover_done callback: record the new lockspace generation, note our own
+ * mount generation and jid on the first call, queue the control work
+ * (unless unmounting) and wake anyone waiting on DFL_DLM_RECOVERY.
+ */
+static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
+ int our_slot, uint32_t generation)
+{
+ struct gfs2_sbd *sdp = arg;
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+ /* ensure the ls jid arrays are large enough; the return value is
+ ignored here, so an allocation failure is not reported from
+ this function */
+ set_recover_size(sdp, slots, num_slots);
+
+ spin_lock(&ls->ls_recover_spin);
+ ls->ls_recover_start = generation;
+
+ /* first recover_done for this mount: ls_recover_mount is still 0 */
+ if (!ls->ls_recover_mount) {
+ ls->ls_recover_mount = generation;
+ ls->ls_jid = our_slot - 1;
+ }
+
+ if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
+ queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
+
+ clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
+ spin_unlock(&ls->ls_recover_spin);
+}
+
+/* gfs2_recover thread has a journal recovery result */
+
+/*
+ * lm_recovery_result hook: store the outcome of a journal recovery in
+ * ls_recover_result[jid] and queue the control work to act on it.
+ *
+ * Nothing is recorded when dlm ops are not in use, when jid is our own
+ * journal, while this node is doing first mounter recovery, or when the
+ * result array is too short for jid (which is logged).
+ */
+static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
+				 unsigned int result)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+		return;
+
+	/* don't care about the recovery of own journal during mount */
+	if (jid == ls->ls_jid)
+		return;
+
+	spin_lock(&ls->ls_recover_spin);
+	if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+		spin_unlock(&ls->ls_recover_spin);
+		return;
+	}
+	if (ls->ls_recover_size < jid + 1) {
+		/* log lines must end in '\n' or they run into the next one */
+		fs_err(sdp, "recovery_result jid %u short size %u\n",
+		       jid, ls->ls_recover_size);
+		spin_unlock(&ls->ls_recover_spin);
+		return;
+	}
+
+	fs_info(sdp, "recover jid %u result %s\n", jid,
+		result == LM_RD_GAVEUP ? "busy" : "success");
+
+	ls->ls_recover_result[jid] = result;
+
+	/* GAVEUP means another node is recovering the journal; delay our
+	   next attempt to recover it, to give the other node a chance to
+	   finish before trying again */
+
+	if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
+		queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work,
+				   result == LM_RD_GAVEUP ? HZ : 0);
+	spin_unlock(&ls->ls_recover_spin);
+}
+
+/* recovery callbacks that gdlm_mount() passes to dlm_new_lockspace() */
+const struct dlm_lockspace_ops gdlm_lockspace_ops = {
+ .recover_prep = gdlm_recover_prep,
+ .recover_slot = gdlm_recover_slot,
+ .recover_done = gdlm_recover_done,
+};
+
+/*
+ * lm_mount hook: initialise the recovery state in the lockstruct, create
+ * or join the dlm lockspace named by the "cluster:fsname" table string,
+ * and run control_mount() when the dlm accepted the lockspace ops.
+ *
+ * Returns 0 on success, including the fallback case where the dlm reports
+ * (ops_result < 0) that ops callbacks are not used.  Returns -EINVAL when
+ * table has no ':' or when ops are in use but SDF_NOJOURNALID is not set,
+ * otherwise the error from set_recover_size(), dlm_new_lockspace() or
+ * control_mount().
+ */
+static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char cluster[GFS2_LOCKNAME_LEN];
+ const char *fsname;
+ uint32_t flags;
+ int error, ops_result;
+
+ /*
+ * initialize everything
+ */
+
+ INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
+ spin_lock_init(&ls->ls_recover_spin);
+ ls->ls_recover_flags = 0;
+ ls->ls_recover_mount = 0;
+ ls->ls_recover_start = 0;
+ ls->ls_recover_block = 0;
+ ls->ls_recover_size = 0;
+ ls->ls_recover_submit = NULL;
+ ls->ls_recover_result = NULL;
+
+ /* allocate the initial jid arrays before any callback can run */
+ error = set_recover_size(sdp, NULL, 0);
+ if (error)
+ goto fail;
+
+ /*
+ * prepare dlm_new_lockspace args
+ */
+
+ fsname = strchr(table, ':');
+ if (!fsname) {
+ fs_info(sdp, "no fsname found\n");
+ error = -EINVAL;
+ goto fail_free;
+ }
+ memset(cluster, 0, sizeof(cluster));
+ /* copy the part of table before ':'; NOTE(review): the length is not
+ checked against sizeof(cluster) here - confirm callers bound it */
+ memcpy(cluster, table, strlen(table) - strlen(fsname));
+ fsname++;
+
+ flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
+ if (ls->ls_nodir)
+ flags |= DLM_LSFL_NODIR;
+
+ /*
+ * create/join lockspace
+ */
+
+ error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
+ &gdlm_lockspace_ops, sdp, &ops_result,
+ &ls->ls_dlm);
+ if (error) {
+ fs_err(sdp, "dlm_new_lockspace error %d\n", error);
+ goto fail_free;
+ }
+
+ if (ops_result < 0) {
+ /*
+ * dlm does not support ops callbacks,
+ * old dlm_controld/gfs_controld are used, try without ops.
+ */
+ fs_info(sdp, "dlm lockspace ops not used\n");
+ free_recover_size(ls);
+ set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags);
+ return 0;
+ }
+
+ /* with ops in use SDF_NOJOURNALID must still be set at this point */
+ if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) {
+ fs_err(sdp, "dlm lockspace ops disallow jid preset\n");
+ error = -EINVAL;
+ goto fail_release;
+ }
+
+ /*
+ * control_mount() uses control_lock to determine first mounter,
+ * and for later mounts, waits for any recoveries to be cleared.
+ */
+
+ error = control_mount(sdp);
+ if (error) {
+ fs_err(sdp, "mount control error %d\n", error);
+ goto fail_release;
+ }
+
+ /* publish the first-mounter result, then clear SDF_NOJOURNALID and
+ wake anyone waiting on that bit */
+ ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+ return 0;
+
+fail_release:
+ dlm_release_lockspace(ls->ls_dlm, 2);
+fail_free:
+ free_recover_size(ls);
+fail:
+ return error;
+}
+
+/*
+ * lm_first_done hook: unless dlm ops are not in use, run
+ * control_first_done() and log any error it returns.
+ */
+static void gdlm_first_done(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int rv;
+
+	if (!test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) {
+		rv = control_first_done(sdp);
+		if (rv)
+			fs_err(sdp, "mount first_done error %d\n", rv);
+	}
+}
+
static void gdlm_unmount(struct gfs2_sbd *sdp)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+ goto release;
+
+ /* wait for gfs2_control_wq to be done with this mount: setting
+ DFL_UNMOUNT stops gdlm_recover_done() and gdlm_recovery_result()
+ from queueing sd_control_work again, then the flush waits for
+ any work already queued or running */
+
+ spin_lock(&ls->ls_recover_spin);
+ set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ flush_delayed_work_sync(&sdp->sd_control_work);
+
+ /* mounted_lock and control_lock will be purged in dlm recovery */
+release:
if (ls->ls_dlm) {
dlm_release_lockspace(ls->ls_dlm, 2);
ls->ls_dlm = NULL;
}
+
+ free_recover_size(ls);
}
static const match_table_t dlm_tokens = {
@@ -226,6 +1197,8 @@ static const match_table_t dlm_tokens = {
const struct lm_lockops gfs2_dlm_ops = {
.lm_proto_name = "lock_dlm",
.lm_mount = gdlm_mount,
+ .lm_first_done = gdlm_first_done,
+ .lm_recovery_result = gdlm_recovery_result,
.lm_unmount = gdlm_unmount,
.lm_put_lock = gdlm_put_lock,
.lm_lock = gdlm_lock,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c150298e2d8e..a8d9bcd0e19c 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -28,6 +28,8 @@
#include "recovery.h"
#include "dir.h"
+struct workqueue_struct *gfs2_control_wq;
+
static struct shrinker qd_shrinker = {
.shrink = gfs2_shrink_qd_memory,
.seeks = DEFAULT_SEEKS,
@@ -146,12 +148,19 @@ static int __init init_gfs2_fs(void)
if (!gfs_recovery_wq)
goto fail_wq;
+ gfs2_control_wq = alloc_workqueue("gfs2_control",
+ WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);
+ if (!gfs2_control_wq)
+ goto fail_control;
+
gfs2_register_debugfs();
printk("GFS2 installed\n");
return 0;
+fail_control:
+ destroy_workqueue(gfs_recovery_wq);
fail_wq:
unregister_filesystem(&gfs2meta_fs_type);
fail_unregister:
@@ -195,6 +204,7 @@ static void __exit exit_gfs2_fs(void)
unregister_filesystem(&gfs2_fs_type);
unregister_filesystem(&gfs2meta_fs_type);
destroy_workqueue(gfs_recovery_wq);
+ destroy_workqueue(gfs2_control_wq);
rcu_barrier();
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index fe72e79e6ff9..6aacf3f230a2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
{
char *message = "FIRSTMOUNT=Done";
char *envp[] = { message, NULL };
- struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- ls->ls_first_done = 1;
+
+ fs_info(sdp, "first mount done, others may mount\n");
+
+ if (sdp->sd_lockstruct.ls_ops->lm_first_done)
+ sdp->sd_lockstruct.ls_ops->lm_first_done(sdp);
+
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
}
@@ -944,7 +948,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
struct gfs2_args *args = &sdp->sd_args;
const char *proto = sdp->sd_proto_name;
const char *table = sdp->sd_table_name;
- const char *fsname;
char *o, *options;
int ret;
@@ -1004,21 +1007,12 @@ hostdata_error:
}
}
- if (sdp->sd_args.ar_spectator)
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
- else
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
- sdp->sd_lockstruct.ls_jid);
-
- fsname = strchr(table, ':');
- if (fsname)
- fsname++;
if (lm->lm_mount == NULL) {
fs_info(sdp, "Now mounting FS...\n");
complete_all(&sdp->sd_locking_init);
return 0;
}
- ret = lm->lm_mount(sdp, fsname);
+ ret = lm->lm_mount(sdp, table);
if (ret == 0)
fs_info(sdp, "Joined cluster. Now mounting FS...\n");
complete_all(&sdp->sd_locking_init);
@@ -1084,7 +1078,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (sdp->sd_args.ar_spectator) {
sb->s_flags |= MS_RDONLY;
- set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+ set_bit(SDF_RORECOVERY, &sdp->sd_flags);
}
if (sdp->sd_args.ar_posix_acl)
sb->s_flags |= MS_POSIXACL;
@@ -1124,6 +1118,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (error)
goto fail;
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
+
gfs2_create_debugfs_file(sdp);
error = gfs2_sys_fs_add(sdp);
@@ -1160,6 +1156,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
goto fail_sb;
}
+ if (sdp->sd_args.ar_spectator)
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
+ sdp->sd_table_name);
+ else
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
+ sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
+
error = init_inodes(sdp, DO);
if (error)
goto fail_sb;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f2a02edcac8f..963b2d75200c 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
char env_status[20];
char *envp[] = { env_jid, env_status, NULL };
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
ls->ls_recover_jid_done = jid;
ls->ls_recover_jid_status = message;
sprintf(env_jid, "JID=%d", jid);
sprintf(env_status, "RECOVERY=%s",
message == LM_RD_SUCCESS ? "Done" : "Failed");
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
+
+ if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
+ sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
}
void gfs2_recover_func(struct work_struct *work)
@@ -512,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work)
if (error)
goto fail_gunlock_ji;
- if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
+ if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) {
+ ro = 1;
+ } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
ro = 1;
} else {
@@ -577,6 +583,7 @@ fail_gunlock_j:
fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
fail:
+ jd->jd_recover_error = error;
gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
done:
clear_bit(JDF_RECOVERY, &jd->jd_flags);
@@ -605,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
TASK_UNINTERRUPTIBLE);
- return 0;
+ return wait ? jd->jd_recover_error : 0;
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 22234627f684..981bfa32121a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1108,9 +1108,9 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
{
struct gfs2_blkreserv *rs = ip->i_res;
- gfs2_blkrsv_put(ip);
if (rs->rs_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+ gfs2_blkrsv_put(ip);
}
/**
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 443cabcfcd23..d33172c291ba 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
ssize_t ret;
int val = 0;
- if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))
val = 1;
ret = sprintf(buf, "%d\n", val);
return ret;
@@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
val = simple_strtol(buf, NULL, 0);
if (val == 1)
- set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+ set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
else if (val == 0) {
- clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
smp_mb__after_clear_bit();
gfs2_glock_thaw(sdp);
} else {
@@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
goto out;
if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
goto out;
- sdp->sd_lockstruct.ls_first = first;
- rv = 0;
+ sdp->sd_lockstruct.ls_first = first;
+ rv = 0;
out:
spin_unlock(&sdp->sd_jindex_spin);
return rv ? rv : len;
@@ -360,19 +360,14 @@ out:
static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- return sprintf(buf, "%d\n", ls->ls_first_done);
+ return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags));
}
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
{
- unsigned jid;
struct gfs2_jdesc *jd;
int rv;
- rv = sscanf(buf, "%u", &jid);
- if (rv != 1)
- return -EINVAL;
-
rv = -ESHUTDOWN;
spin_lock(&sdp->sd_jindex_spin);
if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
@@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
}
out:
spin_unlock(&sdp->sd_jindex_spin);
+ return rv;
+}
+
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ unsigned jid;
+ int rv;
+
+ rv = sscanf(buf, "%u", &jid);
+ if (rv != 1)
+ return -EINVAL;
+
+ rv = gfs2_recover_set(sdp, jid);
+
return rv ? rv : len;
}
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index e94560e836d7..79182d6ad6ac 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
int gfs2_sys_init(void);
void gfs2_sys_uninit(void);
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid);
+
#endif /* __SYS_DOT_H__ */
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index edf0a801446b..427682ca9e48 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -499,9 +499,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi->hidden_dir) {
mutex_lock(&sbi->vh_mutex);
sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
- hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
- sbi->hidden_dir);
+ if (!sbi->hidden_dir) {
+ mutex_unlock(&sbi->vh_mutex);
+ err = -ENOMEM;
+ goto out_put_root;
+ }
+ err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root,
+ &str, sbi->hidden_dir);
mutex_unlock(&sbi->vh_mutex);
+ if (err)
+ goto out_put_hidden_dir;
hfsplus_mark_inode_dirty(sbi->hidden_dir,
HFSPLUS_I_CAT_DIRTY);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e425ad9d0490..1e85a7ac0217 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
}
static int hugetlbfs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
int rc;
diff --git a/fs/inode.c b/fs/inode.c
index 87535753ab04..4fa4f0916af9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -776,6 +776,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
else
__count_vm_events(PGINODESTEAL, reap);
spin_unlock(&sb->s_inode_lru_lock);
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += reap;
dispose_list(&freeable);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 68d704db787f..5069b8475150 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -430,6 +430,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD2: commit phase 1\n");
/*
+ * Clear the revoked flags to reflect that there are no revoked buffers
+ * in the next transaction, which is about to be started.
+ */
+ jbd2_clear_buffer_revoked_flags(journal);
+
+ /*
* Switch to a new revoke table.
*/
jbd2_journal_switch_revoke_table(journal);
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 69fd93588118..30b2867d6cc9 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -47,6 +47,10 @@
* overwriting the new data. We don't even need to clear the revoke
* bit here.
*
+ * We cache the revoke status of a buffer in the current transaction in its
+ * b_state bits. As the name says, the RevokeValid flag indicates that the
+ * cached revoke status of a buffer is valid and can be relied upon.
+ *
* Revoke information on buffers is a tri-state value:
*
* RevokeValid clear: no cached revoke status, need to look it up
@@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
return did_revoke;
}
+/*
+ * jbd2_clear_buffer_revoked_flags clears the revoked flag of buffers in
+ * the revoke table to reflect that there are no revoked buffers in the
+ * next transaction, which is about to be started.
+ */
+void jbd2_clear_buffer_revoked_flags(journal_t *journal)
+{
+ struct jbd2_revoke_table_s *revoke = journal->j_revoke;
+ int i = 0;
+
+ for (i = 0; i < revoke->hash_size; i++) {
+ struct list_head *hash_list;
+ struct list_head *list_entry;
+ hash_list = &revoke->hash_table[i];
+
+ list_for_each(list_entry, hash_list) {
+ struct jbd2_revoke_record_s *record;
+ struct buffer_head *bh;
+ record = (struct jbd2_revoke_record_s *)list_entry;
+ bh = __find_get_block(journal->j_fs_dev,
+ record->blocknr,
+ journal->j_blocksize);
+ if (bh) {
+ clear_buffer_revoked(bh);
+ __brelse(bh);
+ }
+ }
+ }
+}
+
/* journal_switch_revoke table select j_revoke for next transaction
* we do not want to suspend any processing until all revokes are
* written -bzzz
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0e41a4c080e..35ae096bed5d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal)
break;
spin_lock(&transaction->t_handle_lock);
+ prepare_to_wait(&journal->j_wait_updates, &wait,
+ TASK_UNINTERRUPTIBLE);
if (!atomic_read(&transaction->t_updates)) {
spin_unlock(&transaction->t_handle_lock);
+ finish_wait(&journal->j_wait_updates, &wait);
break;
}
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
spin_unlock(&transaction->t_handle_lock);
write_unlock(&journal->j_state_lock);
schedule();
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 43926add945b..54cea8ad5a76 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -339,7 +339,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
dprintk("%s enter. slotid %d seqid %d\n",
__func__, args->csa_slotid, args->csa_sequenceid);
- if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS)
+ if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
return htonl(NFS4ERR_BADSLOT);
slot = tbl->slots + args->csa_slotid;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 873bf00d51a2..277dfaf2e99a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -84,7 +84,7 @@ retry:
/*
* Turn off NFSv4 uid/gid mapping when using AUTH_SYS
*/
-static int nfs4_disable_idmapping = 0;
+static int nfs4_disable_idmapping = 1;
/*
* RPC cruft for NFS
@@ -185,7 +185,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
clp->cl_minorversion = cl_init->minorversion;
clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
#endif
- cred = rpc_lookup_machine_cred();
+ cred = rpc_lookup_machine_cred("*");
if (!IS_ERR(cred))
clp->cl_machine_cred = cred;
nfs_fscache_get_client_cookie(clp);
@@ -250,6 +250,11 @@ static void pnfs_init_server(struct nfs_server *server)
rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
}
+static void nfs4_destroy_server(struct nfs_server *server)
+{
+ nfs4_purge_state_owners(server);
+}
+
#else
static void nfs4_shutdown_client(struct nfs_client *clp)
{
@@ -1065,6 +1070,7 @@ static struct nfs_server *nfs_alloc_server(void)
INIT_LIST_HEAD(&server->master_link);
INIT_LIST_HEAD(&server->delegations);
INIT_LIST_HEAD(&server->layouts);
+ INIT_LIST_HEAD(&server->state_owners_lru);
atomic_set(&server->active, 0);
@@ -1538,6 +1544,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
nfs_server_insert_lists(server);
server->mount_time = jiffies;
+ server->destroy = nfs4_destroy_server;
out:
nfs_free_fattr(fattr);
return error;
@@ -1719,6 +1726,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
/* Copy data from the source */
server->nfs_client = source->nfs_client;
+ server->destroy = source->destroy;
atomic_inc(&server->nfs_client->cl_count);
nfs_server_copy_userdata(server, source);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 606ef0f20aed..c43a452f7da2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -272,13 +272,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
datasync);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
mutex_lock(&inode->i_mutex);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
status = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (status >= 0 && ret < 0)
+ status = ret;
have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
if (have_error)
ret = xchg(&ctx->error, 0);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 47d1c6ff2d8e..2c05f1991e1e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -38,6 +38,89 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/nfs_idmap.h>
+#include <linux/nfs_fs.h>
+
+/**
+ * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
+ * @fattr: fully initialised struct nfs_fattr
+ * @owner_name: owner name string cache
+ * @group_name: group name string cache
+ */
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+ struct nfs4_string *owner_name,
+ struct nfs4_string *group_name)
+{
+ fattr->owner_name = owner_name;
+ fattr->group_name = group_name;
+}
+
+static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr)
+{
+ fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
+ kfree(fattr->owner_name->data);
+}
+
+static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
+{
+ fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
+ kfree(fattr->group_name->data);
+}
+
+static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ struct nfs4_string *owner = fattr->owner_name;
+ __u32 uid;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
+ return false;
+ if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) {
+ fattr->uid = uid;
+ fattr->valid |= NFS_ATTR_FATTR_OWNER;
+ }
+ return true;
+}
+
+static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ struct nfs4_string *group = fattr->group_name;
+ __u32 gid;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
+ return false;
+ if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) {
+ fattr->gid = gid;
+ fattr->valid |= NFS_ATTR_FATTR_GROUP;
+ }
+ return true;
+}
+
+/**
+ * nfs_fattr_free_names - free up the NFSv4 owner and group strings
+ * @fattr: a fully initialised nfs_fattr structure
+ */
+void nfs_fattr_free_names(struct nfs_fattr *fattr)
+{
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)
+ nfs_fattr_free_owner_name(fattr);
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)
+ nfs_fattr_free_group_name(fattr);
+}
+
+/**
+ * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free
+ * @server: pointer to the filesystem nfs_server structure
+ * @fattr: a fully initialised nfs_fattr structure
+ *
+ * This helper maps the cached NFSv4 owner/group strings in fattr into
+ * their numeric uid/gid equivalents, and then frees the cached strings.
+ */
+void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ if (nfs_fattr_map_owner_name(server, fattr))
+ nfs_fattr_free_owner_name(fattr);
+ if (nfs_fattr_map_group_name(server, fattr))
+ nfs_fattr_free_group_name(fattr);
+}
static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 81db25e92e10..25c3bfad7953 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1020,6 +1020,8 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
fattr->valid = 0;
fattr->time_start = jiffies;
fattr->gencount = nfs_inc_attr_generation_counter();
+ fattr->owner_name = NULL;
+ fattr->group_name = NULL;
}
struct nfs_fattr *nfs_alloc_fattr(void)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3f4d95751d52..8102db9b926c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -307,6 +307,8 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
/* write.c */
extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
struct list_head *head);
+extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, int ioflags);
extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
extern void nfs_writedata_release(struct nfs_write_data *wdata);
extern void nfs_commit_free(struct nfs_write_data *p);
@@ -330,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data);
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
- struct page *, struct page *);
+ struct page *, struct page *, enum migrate_mode);
#else
#define nfs_migrate_page NULL
#endif
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 693ae22f8731..4d7d0aedc101 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -94,6 +94,8 @@ struct nfs_unique_id {
struct nfs4_state_owner {
struct nfs_unique_id so_owner_id;
struct nfs_server *so_server;
+ struct list_head so_lru;
+ unsigned long so_expires;
struct rb_node so_server_node;
struct rpc_cred *so_cred; /* Associated cred */
@@ -319,6 +321,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
+extern void nfs4_purge_state_owners(struct nfs_server *);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
extern void nfs4_put_open_state(struct nfs4_state *);
extern void nfs4_close_state(struct nfs4_state *, fmode_t);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index a62d36b9a99e..71ec08617e23 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -49,13 +49,14 @@ filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
loff_t offset)
{
u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
- u64 tmp;
+ u64 stripe_no;
+ u32 rem;
offset -= flseg->pattern_offset;
- tmp = offset;
- do_div(tmp, stripe_width);
+ stripe_no = div_u64(offset, stripe_width);
+ div_u64_rem(offset, flseg->stripe_unit, &rem);
- return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
+ return stripe_no * flseg->stripe_unit + rem;
}
/* This function is used by the layout driver to calculate the
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index dcda0ba7af60..75366dc89686 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -52,6 +52,7 @@
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/module.h>
+#include <linux/nfs_idmap.h>
#include <linux/sunrpc/bc_xprt.h>
#include <linux/xattr.h>
#include <linux/utsname.h>
@@ -364,9 +365,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
* Must be called while holding tbl->slot_tbl_lock
*/
static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
+nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
{
- int free_slotid = free_slot - tbl->slots;
int slotid = free_slotid;
BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
@@ -431,7 +431,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
}
spin_lock(&tbl->slot_tbl_lock);
- nfs4_free_slot(tbl, res->sr_slot);
+ nfs4_free_slot(tbl, res->sr_slot - tbl->slots);
nfs4_check_drain_fc_complete(res->sr_session);
spin_unlock(&tbl->slot_tbl_lock);
res->sr_slot = NULL;
@@ -554,13 +554,10 @@ int nfs41_setup_sequence(struct nfs4_session *session,
spin_lock(&tbl->slot_tbl_lock);
if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
!rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
- /*
- * The state manager will wait until the slot table is empty.
- * Schedule the reset thread
- */
+ /* The state manager will wait until the slot table is empty */
rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
spin_unlock(&tbl->slot_tbl_lock);
- dprintk("%s Schedule Session Reset\n", __func__);
+ dprintk("%s session is draining\n", __func__);
return -EAGAIN;
}
@@ -765,6 +762,8 @@ struct nfs4_opendata {
struct nfs_openres o_res;
struct nfs_open_confirmargs c_arg;
struct nfs_open_confirmres c_res;
+ struct nfs4_string owner_name;
+ struct nfs4_string group_name;
struct nfs_fattr f_attr;
struct nfs_fattr dir_attr;
struct dentry *dir;
@@ -788,6 +787,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
p->o_res.server = p->o_arg.server;
nfs_fattr_init(&p->f_attr);
nfs_fattr_init(&p->dir_attr);
+ nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
}
static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
@@ -819,6 +819,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.name = &dentry->d_name;
p->o_arg.server = server;
p->o_arg.bitmask = server->attr_bitmask;
+ p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
if (flags & O_CREAT) {
u32 *s;
@@ -855,6 +856,7 @@ static void nfs4_opendata_free(struct kref *kref)
dput(p->dir);
dput(p->dentry);
nfs_sb_deactive(sb);
+ nfs_fattr_free_names(&p->f_attr);
kfree(p);
}
@@ -1579,6 +1581,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
if (status != 0 || !data->rpc_done)
return status;
+ nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
+
nfs_refresh_inode(dir, o_res->dir_attr);
if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
@@ -1611,6 +1615,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
return status;
}
+ nfs_fattr_map_and_free_names(server, &data->f_attr);
+
if (o_arg->open_flags & O_CREAT) {
update_changeattr(dir, &o_res->cinfo);
nfs_post_op_update_inode(dir, o_res->dir_attr);
@@ -3431,19 +3437,6 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
*/
#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT)
-static void buf_to_pages(const void *buf, size_t buflen,
- struct page **pages, unsigned int *pgbase)
-{
- const void *p = buf;
-
- *pgbase = offset_in_page(buf);
- p -= *pgbase;
- while (p < buf + buflen) {
- *(pages++) = virt_to_page(p);
- p += PAGE_CACHE_SIZE;
- }
-}
-
static int buf_to_pages_noslab(const void *buf, size_t buflen,
struct page **pages, unsigned int *pgbase)
{
@@ -3540,9 +3533,19 @@ out:
nfs4_set_cached_acl(inode, acl);
}
+/*
+ * The getxattr API returns the required buffer length when called with a
+ * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating
+ * the required buf. On a NULL buf, we send a page of data to the server
+ * guessing that the ACL request can be serviced by a page. If so, we cache
+ * up to the page of ACL data, and the 2nd call to getxattr is serviced by
+ * the cache. If not so, we throw away the page, and cache the required
+ * length. The next getxattr call will then produce another round trip to
+ * the server, this time with the input buf of the required size.
+ */
static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
{
- struct page *pages[NFS4ACL_MAXPAGES];
+ struct page *pages[NFS4ACL_MAXPAGES] = {NULL, };
struct nfs_getaclargs args = {
.fh = NFS_FH(inode),
.acl_pages = pages,
@@ -3557,41 +3560,60 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
.rpc_argp = &args,
.rpc_resp = &res,
};
- struct page *localpage = NULL;
- int ret;
+ int ret = -ENOMEM, npages, i, acl_len = 0;
- if (buflen < PAGE_SIZE) {
- /* As long as we're doing a round trip to the server anyway,
- * let's be prepared for a page of acl data. */
- localpage = alloc_page(GFP_KERNEL);
- resp_buf = page_address(localpage);
- if (localpage == NULL)
- return -ENOMEM;
- args.acl_pages[0] = localpage;
- args.acl_pgbase = 0;
- args.acl_len = PAGE_SIZE;
- } else {
- resp_buf = buf;
- buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
+ npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ /* As long as we're doing a round trip to the server anyway,
+ * let's be prepared for a page of acl data. */
+ if (npages == 0)
+ npages = 1;
+
+ for (i = 0; i < npages; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ goto out_free;
}
- ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
+ if (npages > 1) {
+ /* for decoding across pages */
+ args.acl_scratch = alloc_page(GFP_KERNEL);
+ if (!args.acl_scratch)
+ goto out_free;
+ }
+ args.acl_len = npages * PAGE_SIZE;
+ args.acl_pgbase = 0;
+ /* Let decode_getfacl know not to fail if the ACL data is larger than
+ * the page we send as a guess */
+ if (buf == NULL)
+ res.acl_flags |= NFS4_ACL_LEN_REQUEST;
+ resp_buf = page_address(pages[0]);
+
+ dprintk("%s buf %p buflen %ld npages %d args.acl_len %ld\n",
+ __func__, buf, buflen, npages, args.acl_len);
+ ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
+ &msg, &args.seq_args, &res.seq_res, 0);
if (ret)
goto out_free;
- if (res.acl_len > args.acl_len)
- nfs4_write_cached_acl(inode, NULL, res.acl_len);
+
+ acl_len = res.acl_len - res.acl_data_offset;
+ if (acl_len > args.acl_len)
+ nfs4_write_cached_acl(inode, NULL, acl_len);
else
- nfs4_write_cached_acl(inode, resp_buf, res.acl_len);
+ nfs4_write_cached_acl(inode, resp_buf + res.acl_data_offset,
+ acl_len);
if (buf) {
ret = -ERANGE;
- if (res.acl_len > buflen)
+ if (acl_len > buflen)
goto out_free;
- if (localpage)
- memcpy(buf, resp_buf, res.acl_len);
+ _copy_from_pages(buf, pages, res.acl_data_offset,
+ res.acl_len);
}
- ret = res.acl_len;
+ ret = acl_len;
out_free:
- if (localpage)
- __free_page(localpage);
+ for (i = 0; i < npages; i++)
+ if (pages[i])
+ __free_page(pages[i]);
+ if (args.acl_scratch)
+ __free_page(args.acl_scratch);
return ret;
}
@@ -3622,6 +3644,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
nfs_zap_acl_cache(inode);
ret = nfs4_read_cached_acl(inode, buf, buflen);
if (ret != -ENOENT)
+ /* -ENOENT is returned if there is no ACL or if there is an ACL
+ * but no cached acl data, just the acl length */
return ret;
return nfs4_get_acl_uncached(inode, buf, buflen);
}
@@ -5022,23 +5046,6 @@ out:
return ret;
}
-/*
- * Reset the forechannel and backchannel slot tables
- */
-static int nfs4_reset_slot_tables(struct nfs4_session *session)
-{
- int status;
-
- status = nfs4_reset_slot_table(&session->fc_slot_table,
- session->fc_attrs.max_reqs, 1);
- if (status)
- return status;
-
- status = nfs4_reset_slot_table(&session->bc_slot_table,
- session->bc_attrs.max_reqs, 0);
- return status;
-}
-
/* Destroy the slot table */
static void nfs4_destroy_slot_tables(struct nfs4_session *session)
{
@@ -5084,29 +5091,35 @@ out:
}
/*
- * Initialize the forechannel and backchannel tables
+ * Initialize or reset the forechannel and backchannel tables
*/
-static int nfs4_init_slot_tables(struct nfs4_session *session)
+static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
{
struct nfs4_slot_table *tbl;
- int status = 0;
+ int status;
- tbl = &session->fc_slot_table;
+ dprintk("--> %s\n", __func__);
+ /* Fore channel */
+ tbl = &ses->fc_slot_table;
if (tbl->slots == NULL) {
- status = nfs4_init_slot_table(tbl,
- session->fc_attrs.max_reqs, 1);
+ status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
+ if (status) /* -ENOMEM */
+ return status;
+ } else {
+ status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
if (status)
return status;
}
-
- tbl = &session->bc_slot_table;
+ /* Back channel */
+ tbl = &ses->bc_slot_table;
if (tbl->slots == NULL) {
- status = nfs4_init_slot_table(tbl,
- session->bc_attrs.max_reqs, 0);
+ status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
if (status)
- nfs4_destroy_slot_tables(session);
- }
-
+ /* Fore and back channel share a connection so get
+ * both slot tables or neither */
+ nfs4_destroy_slot_tables(ses);
+ } else
+ status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
return status;
}
@@ -5294,13 +5307,9 @@ int nfs4_proc_create_session(struct nfs_client *clp)
if (status)
goto out;
- /* Init and reset the fore channel */
- status = nfs4_init_slot_tables(session);
- dprintk("slot table initialization returned %d\n", status);
- if (status)
- goto out;
- status = nfs4_reset_slot_tables(session);
- dprintk("slot table reset returned %d\n", status);
+ /* Init or reset the session slot tables */
+ status = nfs4_setup_session_slot_tables(session);
+ dprintk("slot table setup returned %d\n", status);
if (status)
goto out;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6a7107ae6b72..a53f33b4ac3a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
#include <linux/ratelimit.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>
+#include <linux/jiffies.h>
#include "nfs4_fs.h"
#include "callback.h"
@@ -377,31 +378,24 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
{
struct rb_node **p = &server->state_owners.rb_node,
*parent = NULL;
- struct nfs4_state_owner *sp, *res = NULL;
+ struct nfs4_state_owner *sp;
while (*p != NULL) {
parent = *p;
sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
- if (server < sp->so_server) {
- p = &parent->rb_left;
- continue;
- }
- if (server > sp->so_server) {
- p = &parent->rb_right;
- continue;
- }
if (cred < sp->so_cred)
p = &parent->rb_left;
else if (cred > sp->so_cred)
p = &parent->rb_right;
else {
+ if (!list_empty(&sp->so_lru))
+ list_del_init(&sp->so_lru);
atomic_inc(&sp->so_count);
- res = sp;
- break;
+ return sp;
}
}
- return res;
+ return NULL;
}
static struct nfs4_state_owner *
@@ -421,6 +415,8 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
else if (new->so_cred > sp->so_cred)
p = &parent->rb_right;
else {
+ if (!list_empty(&sp->so_lru))
+ list_del_init(&sp->so_lru);
atomic_inc(&sp->so_count);
return sp;
}
@@ -462,6 +458,7 @@ nfs4_alloc_state_owner(void)
spin_lock_init(&sp->so_sequence.lock);
INIT_LIST_HEAD(&sp->so_sequence.list);
atomic_set(&sp->so_count, 1);
+ INIT_LIST_HEAD(&sp->so_lru);
return sp;
}
@@ -479,6 +476,38 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
}
}
+static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
+{
+ rpc_destroy_wait_queue(&sp->so_sequence.wait);
+ put_rpccred(sp->so_cred);
+ kfree(sp);
+}
+
+static void nfs4_gc_state_owners(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *tmp;
+ unsigned long time_min, time_max;
+ LIST_HEAD(doomed);
+
+ spin_lock(&clp->cl_lock);
+ time_max = jiffies;
+ time_min = (long)time_max - (long)clp->cl_lease_time;
+ list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+ /* NB: LRU is sorted so that oldest is at the head */
+ if (time_in_range(sp->so_expires, time_min, time_max))
+ break;
+ list_move(&sp->so_lru, &doomed);
+ nfs4_remove_state_owner_locked(sp);
+ }
+ spin_unlock(&clp->cl_lock);
+
+ list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+ list_del(&sp->so_lru);
+ nfs4_free_state_owner(sp);
+ }
+}
+
/**
* nfs4_get_state_owner - Look up a state owner given a credential
* @server: nfs_server to search
@@ -496,10 +525,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
sp = nfs4_find_state_owner_locked(server, cred);
spin_unlock(&clp->cl_lock);
if (sp != NULL)
- return sp;
+ goto out;
new = nfs4_alloc_state_owner();
if (new == NULL)
- return NULL;
+ goto out;
new->so_server = server;
new->so_cred = cred;
spin_lock(&clp->cl_lock);
@@ -511,26 +540,58 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
rpc_destroy_wait_queue(&new->so_sequence.wait);
kfree(new);
}
+out:
+ nfs4_gc_state_owners(server);
return sp;
}
/**
* nfs4_put_state_owner - Release a nfs4_state_owner
* @sp: state owner data to release
- *
*/
void nfs4_put_state_owner(struct nfs4_state_owner *sp)
{
- struct nfs_client *clp = sp->so_server->nfs_client;
- struct rpc_cred *cred = sp->so_cred;
+ struct nfs_server *server = sp->so_server;
+ struct nfs_client *clp = server->nfs_client;
if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
return;
- nfs4_remove_state_owner_locked(sp);
+
+ if (!RB_EMPTY_NODE(&sp->so_server_node)) {
+ sp->so_expires = jiffies;
+ list_add_tail(&sp->so_lru, &server->state_owners_lru);
+ spin_unlock(&clp->cl_lock);
+ } else {
+ nfs4_remove_state_owner_locked(sp);
+ spin_unlock(&clp->cl_lock);
+ nfs4_free_state_owner(sp);
+ }
+}
+
+/**
+ * nfs4_purge_state_owners - Release all cached state owners
+ * @server: nfs_server with cached state owners to release
+ *
+ * Called at umount time. Remaining state owners will be on
+ * the LRU with ref count of zero.
+ */
+void nfs4_purge_state_owners(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *tmp;
+ LIST_HEAD(doomed);
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+ list_move(&sp->so_lru, &doomed);
+ nfs4_remove_state_owner_locked(sp);
+ }
spin_unlock(&clp->cl_lock);
- rpc_destroy_wait_queue(&sp->so_sequence.wait);
- put_rpccred(cred);
- kfree(sp);
+
+ list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+ list_del(&sp->so_lru);
+ nfs4_free_state_owner(sp);
+ }
}
static struct nfs4_state *
@@ -1402,6 +1463,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
restart:
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ nfs4_purge_state_owners(server);
spin_lock(&clp->cl_lock);
for (pos = rb_first(&server->state_owners);
pos != NULL;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e6161b213ed1..95e92e438407 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2298,7 +2298,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getfh(xdr, &hdr);
encode_getfattr(xdr, args->bitmask, &hdr);
encode_restorefh(xdr, &hdr);
- encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_getfattr(xdr, args->dir_bitmask, &hdr);
encode_nops(&hdr);
}
@@ -2517,11 +2517,13 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
- replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
+ replen = hdr.replen + op_decode_hdr_maxsz + 1;
encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
args->acl_pages, args->acl_pgbase, args->acl_len);
+ xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE);
+
encode_nops(&hdr);
}
@@ -3790,7 +3792,8 @@ out_overflow:
}
static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
- const struct nfs_server *server, uint32_t *uid, int may_sleep)
+ const struct nfs_server *server, uint32_t *uid,
+ struct nfs4_string *owner_name)
{
uint32_t len;
__be32 *p;
@@ -3807,8 +3810,12 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
goto out_overflow;
- if (!may_sleep) {
- /* do nothing */
+ if (owner_name != NULL) {
+ owner_name->data = kmemdup(p, len, GFP_NOWAIT);
+ if (owner_name->data != NULL) {
+ owner_name->len = len;
+ ret = NFS_ATTR_FATTR_OWNER_NAME;
+ }
} else if (len < XDR_MAX_NETOBJ) {
if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
ret = NFS_ATTR_FATTR_OWNER;
@@ -3828,7 +3835,8 @@ out_overflow:
}
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
- const struct nfs_server *server, uint32_t *gid, int may_sleep)
+ const struct nfs_server *server, uint32_t *gid,
+ struct nfs4_string *group_name)
{
uint32_t len;
__be32 *p;
@@ -3845,8 +3853,12 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
goto out_overflow;
- if (!may_sleep) {
- /* do nothing */
+ if (group_name != NULL) {
+ group_name->data = kmemdup(p, len, GFP_NOWAIT);
+ if (group_name->data != NULL) {
+ group_name->len = len;
+ ret = NFS_ATTR_FATTR_GROUP_NAME;
+ }
} else if (len < XDR_MAX_NETOBJ) {
if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
ret = NFS_ATTR_FATTR_GROUP;
@@ -4283,7 +4295,7 @@ xdr_error:
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
struct nfs_fattr *fattr, struct nfs_fh *fh,
- const struct nfs_server *server, int may_sleep)
+ const struct nfs_server *server)
{
int status;
umode_t fmode = 0;
@@ -4350,12 +4362,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
+ status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
+ status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
@@ -4396,7 +4408,7 @@ xdr_error:
}
static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
- struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
+ struct nfs_fh *fh, const struct nfs_server *server)
{
__be32 *savep;
uint32_t attrlen,
@@ -4415,7 +4427,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
if (status < 0)
goto xdr_error;
- status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
+ status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server);
if (status < 0)
goto xdr_error;
@@ -4426,9 +4438,9 @@ xdr_error:
}
static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
- const struct nfs_server *server, int may_sleep)
+ const struct nfs_server *server)
{
- return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
+ return decode_getfattr_generic(xdr, fattr, NULL, server);
}
/*
@@ -4957,17 +4969,18 @@ decode_restorefh(struct xdr_stream *xdr)
}
static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
- size_t *acl_len)
+ struct nfs_getaclres *res)
{
- __be32 *savep;
+ __be32 *savep, *bm_p;
uint32_t attrlen,
bitmap[3] = {0};
struct kvec *iov = req->rq_rcv_buf.head;
int status;
- *acl_len = 0;
+ res->acl_len = 0;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
goto out;
+ bm_p = xdr->p;
if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
goto out;
if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
@@ -4979,18 +4992,30 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
size_t hdrlen;
u32 recvd;
+ /* The bitmap (xdr len + bitmaps) and the attr xdr len words
+ * are stored with the acl data to handle the problem of
+ * variable length bitmaps.*/
+ xdr->p = bm_p;
+ res->acl_data_offset = be32_to_cpup(bm_p) + 2;
+ res->acl_data_offset <<= 2;
+
/* We ignore &savep and don't do consistency checks on
* the attr length. Let userspace figure it out.... */
hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
+ attrlen += res->acl_data_offset;
recvd = req->rq_rcv_buf.len - hdrlen;
if (attrlen > recvd) {
- dprintk("NFS: server cheating in getattr"
- " acl reply: attrlen %u > recvd %u\n",
+ if (res->acl_flags & NFS4_ACL_LEN_REQUEST) {
+ /* getxattr interface called with a NULL buf */
+ res->acl_len = attrlen;
+ goto out;
+ }
+ dprintk("NFS: acl reply: attrlen %u > recvd %u\n",
attrlen, recvd);
return -EINVAL;
}
xdr_read_pages(xdr, attrlen);
- *acl_len = attrlen;
+ res->acl_len = attrlen;
} else
status = -EOPNOTSUPP;
@@ -5696,8 +5721,7 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
status = decode_open_downgrade(xdr, res);
if (status != 0)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5723,8 +5747,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_access(xdr, res);
if (status != 0)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5753,8 +5776,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server
- ,!RPC_IS_ASYNC(rqstp->rq_task));
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5780,8 +5802,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
goto out;
status = decode_getfh(xdr, res->fh);
if (status == 0)
- status = decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5807,8 +5828,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_remove(xdr, &res->cinfo);
if (status)
goto out;
- decode_getfattr(xdr, res->dir_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->dir_attr, res->server);
out:
return status;
}
@@ -5841,14 +5861,12 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
if (status)
goto out;
/* Current FH is target directory */
- if (decode_getfattr(xdr, res->new_fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+ if (decode_getfattr(xdr, res->new_fattr, res->server))
goto out;
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->old_fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->old_fattr, res->server);
out:
return status;
}
@@ -5884,14 +5902,12 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
* Note order: OP_LINK leaves the directory as the current
* filehandle.
*/
- if (decode_getfattr(xdr, res->dir_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+ if (decode_getfattr(xdr, res->dir_attr, res->server))
goto out;
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5923,14 +5939,12 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- if (decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+ if (decode_getfattr(xdr, res->fattr, res->server))
goto out;
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->dir_fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->dir_fattr, res->server);
out:
return status;
}
@@ -5962,8 +5976,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6028,7 +6041,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getacl(xdr, rqstp, &res->acl_len);
+ status = decode_getacl(xdr, rqstp, res);
out:
return status;
@@ -6061,8 +6074,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
* an ESTALE error. Shouldn't be a problem,
* though, since fattr->valid will remain unset.
*/
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6093,13 +6105,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
goto out;
if (decode_getfh(xdr, &res->fh) != 0)
goto out;
- if (decode_getfattr(xdr, res->f_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+ if (decode_getfattr(xdr, res->f_attr, res->server) != 0)
goto out;
if (decode_restorefh(xdr) != 0)
goto out;
- decode_getfattr(xdr, res->dir_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->dir_attr, res->server);
out:
return status;
}
@@ -6147,8 +6157,7 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
status = decode_open(xdr, res);
if (status)
goto out;
- decode_getfattr(xdr, res->f_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->f_attr, res->server);
out:
return status;
}
@@ -6175,8 +6184,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
status = decode_setattr(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6356,8 +6364,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
if (status)
goto out;
if (res->fattr)
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
if (!status)
status = res->count;
out:
@@ -6386,8 +6393,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
if (status)
goto out;
if (res->fattr)
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6546,8 +6552,7 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
status = decode_delegreturn(xdr);
if (status != 0)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6576,8 +6581,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
goto out;
xdr_enter_page(xdr, PAGE_SIZE);
status = decode_getfattr(xdr, &res->fs_locations->fattr,
- res->fs_locations->server,
- !RPC_IS_ASYNC(req->rq_task));
+ res->fs_locations->server);
out:
return status;
}
@@ -6826,8 +6830,7 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
status = decode_layoutcommit(xdr, rqstp, res);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6958,7 +6961,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
goto out_overflow;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
- entry->server, 1) < 0)
+ entry->server) < 0)
goto out_overflow;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c807ab93140e..55d01280a609 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -551,7 +551,8 @@ static const struct nfs_pageio_ops objio_pg_write_ops = {
static struct pnfs_layoutdriver_type objlayout_type = {
.id = LAYOUT_OSD2_OBJECTS,
.name = "LAYOUT_OSD2_OBJECTS",
- .flags = PNFS_LAYOUTRET_ON_SETATTR,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_LAYOUTRET_ON_ERROR,
.alloc_layout_hdr = objlayout_alloc_layout_hdr,
.free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 72074e3a04f9..b3c29039f5b8 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -254,6 +254,8 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
oir->status = rdata->task.tk_status = status;
if (status >= 0)
rdata->res.count = status;
+ else
+ rdata->pnfs_error = status;
objlayout_iodone(oir);
/* must not use oir after this point */
@@ -334,6 +336,8 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
if (status >= 0) {
wdata->res.count = status;
wdata->verf.committed = oir->committed;
+ } else {
+ wdata->pnfs_error = status;
}
objlayout_iodone(oir);
/* must not use oir after this point */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8e672a2b2d69..17149a490065 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1166,6 +1166,33 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
+static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head)
+{
+ struct nfs_pageio_descriptor pgio;
+ LIST_HEAD(failed);
+
+ /* Resend all requests through the MDS */
+ nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE);
+ while (!list_empty(head)) {
+ struct nfs_page *req = nfs_list_entry(head->next);
+
+ nfs_list_remove_request(req);
+ if (!nfs_pageio_add_request(&pgio, req))
+ nfs_list_add_request(req, &failed);
+ }
+ nfs_pageio_complete(&pgio);
+
+ if (!list_empty(&failed)) {
+ /* For some reason our attempt to resend pages failed. Mark the
+ * overall send request as having failed, and let
+ * nfs_writeback_release_full deal with the error.
+ */
+ list_move(&failed, head);
+ return -EIO;
+ }
+ return 0;
+}
+
/*
* Called by non rpc-based layout drivers
*/
@@ -1175,9 +1202,17 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
pnfs_set_layoutcommit(data);
data->mds_ops->rpc_call_done(&data->task, data);
} else {
- put_lseg(data->lseg);
- data->lseg = NULL;
dprintk("pnfs write error = %d\n", data->pnfs_error);
+ if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_ERROR) {
+ /* Don't lo_commit on error; the server will need to
+ * perform a file recovery.
+ */
+ clear_bit(NFS_INO_LAYOUTCOMMIT,
+ &NFS_I(data->inode)->flags);
+ pnfs_return_layout(data->inode);
+ }
+ data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
}
data->mds_ops->rpc_release(data);
}
@@ -1267,6 +1302,9 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
put_lseg(data->lseg);
data->lseg = NULL;
dprintk("pnfs write error = %d\n", data->pnfs_error);
+ if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_ERROR)
+ pnfs_return_layout(data->inode);
nfs_pageio_init_read_mds(&pgio, data->inode);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1509530cb111..53d593a0a4f2 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -68,6 +68,7 @@ enum {
enum layoutdriver_policy_flags {
/* Should the pNFS client commit and return the layout upon a setattr */
PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
+ PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
};
struct nfs4_deviceid_node;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e463967aafb8..3dfa4f112c0a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -908,10 +908,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
data->auth_flavor_len = 1;
data->version = version;
data->minorversion = 0;
+ security_init_mnt_opts(&data->lsm_opts);
}
return data;
}
+static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data)
+{
+ if (data) {
+ kfree(data->client_address);
+ kfree(data->mount_server.hostname);
+ kfree(data->nfs_server.export_path);
+ kfree(data->nfs_server.hostname);
+ kfree(data->fscache_uniq);
+ security_free_mnt_opts(&data->lsm_opts);
+ kfree(data);
+ }
+}
+
/*
* Sanity-check a server address provided by the mount command.
*
@@ -2219,9 +2233,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
mntfh = nfs_alloc_fhandle();
if (data == NULL || mntfh == NULL)
- goto out_free_fh;
-
- security_init_mnt_opts(&data->lsm_opts);
+ goto out;
/* Validate the mount data */
error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
@@ -2233,8 +2245,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
#ifdef CONFIG_NFS_V4
if (data->version == 4) {
mntroot = nfs4_try_mount(flags, dev_name, data);
- kfree(data->client_address);
- kfree(data->nfs_server.export_path);
goto out;
}
#endif /* CONFIG_NFS_V4 */
@@ -2289,13 +2299,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
s->s_flags |= MS_ACTIVE;
out:
- kfree(data->nfs_server.hostname);
- kfree(data->mount_server.hostname);
- kfree(data->fscache_uniq);
- security_free_mnt_opts(&data->lsm_opts);
-out_free_fh:
+ nfs_free_parsed_mount_data(data);
nfs_free_fhandle(mntfh);
- kfree(data);
return mntroot;
out_err_nosb:
@@ -2622,9 +2627,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
mntfh = nfs_alloc_fhandle();
if (data == NULL || mntfh == NULL)
- goto out_free_fh;
-
- security_init_mnt_opts(&data->lsm_opts);
+ goto out;
/* Get a volume representation */
server = nfs4_create_server(data, mntfh);
@@ -2676,13 +2679,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
s->s_flags |= MS_ACTIVE;
- security_free_mnt_opts(&data->lsm_opts);
nfs_free_fhandle(mntfh);
return mntroot;
out:
- security_free_mnt_opts(&data->lsm_opts);
-out_free_fh:
nfs_free_fhandle(mntfh);
return ERR_PTR(error);
@@ -2839,7 +2839,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
data = nfs_alloc_parsed_mount_data(4);
if (data == NULL)
- goto out_free_data;
+ goto out;
/* Validate the mount data */
error = nfs4_validate_mount_data(raw_data, data, dev_name);
@@ -2853,12 +2853,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
error = PTR_ERR(res);
out:
- kfree(data->client_address);
- kfree(data->nfs_server.export_path);
- kfree(data->nfs_server.hostname);
- kfree(data->fscache_uniq);
-out_free_data:
- kfree(data);
+ nfs_free_parsed_mount_data(data);
dprintk("<-- nfs4_mount() = %d%s\n", error,
error != 0 ? " [error]" : "");
return res;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1dda78db6a73..834f0fe96f89 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1052,7 +1052,7 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
.pg_doio = nfs_generic_pg_writepages,
};
-static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
struct inode *inode, int ioflags)
{
nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
@@ -1166,13 +1166,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
static void nfs_writeback_release_full(void *calldata)
{
struct nfs_write_data *data = calldata;
- int ret, status = data->task.tk_status;
- struct nfs_pageio_descriptor pgio;
-
- if (data->pnfs_error) {
- nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
- pgio.pg_recoalesce = 1;
- }
+ int status = data->task.tk_status;
/* Update attributes as result of writeback. */
while (!list_empty(&data->pages)) {
@@ -1188,11 +1182,6 @@ static void nfs_writeback_release_full(void *calldata)
req->wb_bytes,
(long long)req_offset(req));
- if (data->pnfs_error) {
- dprintk(", pnfs error = %d\n", data->pnfs_error);
- goto next;
- }
-
if (status < 0) {
nfs_set_pageerror(page);
nfs_context_set_write_error(req->wb_context, status);
@@ -1212,19 +1201,7 @@ remove_request:
next:
nfs_clear_page_tag_locked(req);
nfs_end_page_writeback(page);
- if (data->pnfs_error) {
- lock_page(page);
- nfs_pageio_cond_complete(&pgio, page->index);
- ret = nfs_page_async_flush(&pgio, page, 0);
- if (ret) {
- nfs_set_pageerror(page);
- dprintk("rewrite to MDS error = %d\n", ret);
- }
- unlock_page(page);
- }
}
- if (data->pnfs_error)
- nfs_pageio_complete(&pgio);
nfs_writedata_release(calldata);
}
@@ -1711,7 +1688,7 @@ out_error:
#ifdef CONFIG_MIGRATION
int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page)
+ struct page *page, enum migrate_mode mode)
{
/*
* If PagePrivate is set, then the page is currently associated with
@@ -1726,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
nfs_fscache_release_page(page, GFP_KERNEL);
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7748d6a18d97..6f3ebb48b12f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -718,7 +718,7 @@ int set_callback_cred(void)
{
if (callback_cred)
return 0;
- callback_cred = rpc_lookup_machine_cred();
+ callback_cred = rpc_lookup_machine_cred("nfs");
if (!callback_cred)
return -ENOMEM;
return 0;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index a5ebe421195f..286edf1e231f 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -827,8 +827,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
goto out;
}
- rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
- &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
+ rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
+ NULL, NULL, NULL, &fsdlm);
if (rc) {
ocfs2_live_connection_drop(control);
goto out;
diff --git a/fs/pipe.c b/fs/pipe.c
index f0e485d54e64..a932ced92a16 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
if (nr_pages < pipe->nrbufs)
return -EBUSY;
- bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
+ bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
if (unlikely(!bufs))
return -ENOMEM;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 8c344f037bd0..9252ee3b71e3 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
pid_nr_ns(pid, ns),
tcomm,
state,
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
task->policy,
(unsigned long long)delayacct_blkio_ticks(task),
cputime_to_clock_t(gtime),
- cputime_to_clock_t(cgtime));
+ cputime_to_clock_t(cgtime),
+ (mm && permitted) ? mm->start_data : 0,
+ (mm && permitted) ? mm->end_data : 0,
+ (mm && permitted) ? mm->start_brk : 0);
if (mm)
mmput(mm);
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1dddda999f2..5485a5388ecb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,9 +83,11 @@
#include <linux/pid_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
+#include <linux/flex_array.h>
#ifdef CONFIG_HARDWALL
#include <asm/hardwall.h>
#endif
+#include <trace/events/oom.h>
#include "internal.h"
/* NOTE:
@@ -133,6 +135,8 @@ struct pid_entry {
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
+static int proc_fd_permission(struct inode *inode, int mask);
+
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
@@ -165,9 +169,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
return result;
}
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
@@ -182,9 +186,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
return result;
}
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
@@ -627,6 +631,52 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
}
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+ struct task_struct *task,
+ int hide_pid_min)
+{
+ if (pid->hide_pid < hide_pid_min)
+ return true;
+ if (in_group_p(pid->pid_gid))
+ return true;
+ return ptrace_may_access(task, PTRACE_MODE_READ);
+}
+
+
+static int proc_pid_permission(struct inode *inode, int mask)
+{
+ struct pid_namespace *pid = inode->i_sb->s_fs_info;
+ struct task_struct *task;
+ bool has_perms;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ has_perms = has_pid_permissions(pid, task, 1);
+ put_task_struct(task);
+
+ if (!has_perms) {
+ if (pid->hide_pid == 2) {
+ /*
+ * Let's make getdents(), stat(), and open()
+ * consistent with each other. If a process
+ * may not stat() a file, it shouldn't be seen
+ * in procfs at all.
+ */
+ return -ENOENT;
+ }
+
+ return -EPERM;
+ }
+ return generic_permission(inode, mask);
+}
+
+
+
static const struct inode_operations proc_def_inode_operations = {
.setattr = proc_setattr,
};
@@ -1010,6 +1060,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
else
task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
-OOM_DISABLE;
+ trace_oom_score_adj_update(task);
err_sighand:
unlock_task_sighand(task, &flags);
err_task_lock:
@@ -1097,6 +1148,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
task->signal->oom_score_adj = oom_score_adj;
if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
task->signal->oom_score_adj_min = oom_score_adj;
+ trace_oom_score_adj_update(task);
/*
* Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
* always attainable.
@@ -1453,13 +1505,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
.release = single_release,
};
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
struct task_struct *task;
struct mm_struct *mm;
struct file *exe_file;
- task = get_proc_task(inode);
+ task = get_proc_task(dentry->d_inode);
if (!task)
return -ENOENT;
mm = get_task_mm(task);
@@ -1489,7 +1541,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+ error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
out:
return ERR_PTR(error);
}
@@ -1528,7 +1580,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &path);
+ error = PROC_I(inode)->op.proc_get_link(dentry, &path);
if (error)
goto out;
@@ -1609,6 +1661,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
struct inode *inode = dentry->d_inode;
struct task_struct *task;
const struct cred *cred;
+ struct pid_namespace *pid = dentry->d_sb->s_fs_info;
generic_fillattr(inode, stat);
@@ -1617,6 +1670,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
stat->gid = 0;
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
+ if (!has_pid_permissions(pid, task, 2)) {
+ rcu_read_unlock();
+ /*
+ * This doesn't prevent learning whether PID exists,
+ * it only makes getattr() consistent with readdir().
+ */
+ return -ENOENT;
+ }
if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
task_dumpable(task)) {
cred = __task_cred(task);
@@ -1820,9 +1881,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
return -ENOENT;
}
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
{
- return proc_fd_info(inode, path, NULL);
+ return proc_fd_info(dentry->d_inode, path, NULL);
}
static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -2043,6 +2104,355 @@ static const struct file_operations proc_fd_operations = {
.llseek = default_llseek,
};
+#ifdef CONFIG_CHECKPOINT_RESTORE
+
+/*
+ * dname_to_vma_addr - parse a "<start>-<end>" hex dentry name into the vma
+ * start and end addresses; returns 0 on success, -EINVAL on a malformed name.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+ unsigned long *start, unsigned long *end)
+{
+ if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) /* both fields must convert */
+ return -EINVAL;
+
+ return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) /* ->d_revalidate: entry is valid only while a vma with exactly the named range exists */
+{
+ unsigned long vm_start, vm_end;
+ bool exact_vma_exists = false;
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ const struct cred *cred;
+ struct inode *inode;
+ int status = 0; /* stays 0 (stale) unless the vma is found below */
+
+ if (nd && nd->flags & LOOKUP_RCU)
+ return -ECHILD; /* bail out of RCU-mode lookup; the code below takes mmap_sem */
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ status = -EACCES;
+ goto out_notask;
+ }
+
+ inode = dentry->d_inode;
+ task = get_proc_task(inode);
+ if (!task)
+ goto out_notask;
+
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out;
+
+ if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { /* 0 means the name parsed */
+ down_read(&mm->mmap_sem);
+ exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+ up_read(&mm->mmap_sem);
+ }
+
+ mmput(mm);
+
+ if (exact_vma_exists) {
+ if (task_dumpable(task)) { /* dumpable task: inode takes the task's effective ids */
+ rcu_read_lock();
+ cred = __task_cred(task);
+ inode->i_uid = cred->euid;
+ inode->i_gid = cred->egid;
+ rcu_read_unlock();
+ } else { /* otherwise the inode is owned by uid/gid 0 */
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ }
+ security_task_to_inode(task, inode);
+ status = 1;
+ }
+
+out:
+ put_task_struct(task);
+
+out_notask:
+ if (status <= 0) /* stale or error: unhash the dentry */
+ d_drop(dentry);
+
+ return status;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = { /* dentry ops attached to each map_files entry by proc_map_files_instantiate() */
+ .d_revalidate = map_files_d_revalidate,
+ .d_delete = pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path) /* resolve a map_files symlink to the path of the file backing the named vma; 0 with *path referenced on success, negative errno otherwise */
+{
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ int rc;
+
+ rc = -ENOENT;
+ task = get_proc_task(dentry->d_inode);
+ if (!task)
+ goto out;
+
+ mm = get_task_mm(task);
+ put_task_struct(task); /* only the mm is needed from here on */
+ if (!mm)
+ goto out;
+
+ rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+ if (rc)
+ goto out_mmput;
+ rc = -ENOENT; /* rc is 0 after a successful parse: reset it so a missing or anonymous vma is not reported as success with *path unset */
+ down_read(&mm->mmap_sem);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (vma && vma->vm_file) {
+ *path = vma->vm_file->f_path;
+ path_get(path); /* take a reference on the path handed back to the caller */
+ rc = 0;
+ }
+ up_read(&mm->mmap_sem);
+
+out_mmput:
+ mmput(mm);
+out:
+ return rc;
+}
+
+struct map_files_info { /* one collected entry for proc_map_files_readdir() */
+ struct file *file; /* backing file of the vma; referenced with get_file() when collected */
+ unsigned long len; /* length of name as returned by snprintf() */
+ unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry * /* NULL once the dentry has been added, ERR_PTR(-ENOENT) on failure */
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+ struct task_struct *task, const void *ptr) /* ptr is the struct file backing the vma, or NULL */
+{
+ const struct file *file = ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
+
+ if (!file)
+ return ERR_PTR(-ENOENT); /* vma without a backing file: no entry */
+
+ inode = proc_pid_make_inode(dir->i_sb, task);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ ei = PROC_I(inode);
+ ei->op.proc_get_link = proc_map_files_get_link;
+
+ inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_size = 64;
+ inode->i_mode = S_IFLNK;
+
+ if (file->f_mode & FMODE_READ) /* owner permission bits mirror how the file was opened */
+ inode->i_mode |= S_IRUSR;
+ if (file->f_mode & FMODE_WRITE)
+ inode->i_mode |= S_IWUSR;
+
+ d_set_d_op(dentry, &tid_map_files_dentry_operations);
+ d_add(dentry, inode);
+
+ return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir, /* ->lookup: instantiate an entry when the name matches an existing vma exactly */
+ struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct dentry *result;
+ struct mm_struct *mm;
+
+ result = ERR_PTR(-EACCES); /* result is preset before each check so every goto carries the right error */
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ result = ERR_PTR(-ENOENT);
+ task = get_proc_task(dir);
+ if (!task)
+ goto out;
+
+ result = ERR_PTR(-EACCES);
+ if (lock_trace(task))
+ goto out_put_task;
+
+ result = ERR_PTR(-ENOENT);
+ if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+ goto out_unlock;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_unlock;
+
+ down_read(&mm->mmap_sem);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (!vma)
+ goto out_no_vma;
+
+ result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); /* a NULL vm_file yields ERR_PTR(-ENOENT) from the callee */
+
+out_no_vma:
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+out_unlock:
+ unlock_trace(task);
+out_put_task:
+ put_task_struct(task);
+out:
+ return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = { /* inode ops of the map_files directory itself */
+ .lookup = proc_map_files_lookup,
+ .permission = proc_fd_permission,
+ .setattr = proc_setattr,
+};
+
+static int
+proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) /* ->readdir: emits ".", "..", then one "<start>-<end>" entry per file-backed vma */
+{
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ ino_t ino;
+ int ret;
+
+ ret = -EACCES;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ ret = -ENOENT;
+ task = get_proc_task(inode);
+ if (!task)
+ goto out;
+
+ ret = -EACCES;
+ if (lock_trace(task))
+ goto out_put_task;
+
+ ret = 0;
+ switch (filp->f_pos) {
+ case 0:
+ ino = inode->i_ino;
+ if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+ goto out_unlock;
+ filp->f_pos++; /* no break: continue with ".." */
+ case 1:
+ ino = parent_ino(dentry);
+ if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+ goto out_unlock;
+ filp->f_pos++; /* no break: continue with the vma entries */
+ default:
+ {
+ unsigned long nr_files, pos, i;
+ struct flex_array *fa = NULL;
+ struct map_files_info info;
+ struct map_files_info *p;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_unlock;
+ down_read(&mm->mmap_sem);
+
+ nr_files = 0;
+
+ /*
+ * We need two passes here:
+ *
+ * 1) Collect vmas of mapped files with mmap_sem taken
+ * 2) Release mmap_sem and instantiate entries
+ *
+ * otherwise lockdep complains, since the filldir()
+ * routine might need to take mmap_sem in might_fault().
+ */
+
+ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { /* count file-backed vmas at or beyond the current f_pos */
+ if (vma->vm_file && ++pos > filp->f_pos)
+ nr_files++;
+ }
+
+ if (nr_files) {
+ fa = flex_array_alloc(sizeof(info), nr_files,
+ GFP_KERNEL);
+ if (!fa || flex_array_prealloc(fa, 0, nr_files,
+ GFP_KERNEL)) {
+ ret = -ENOMEM;
+ if (fa)
+ flex_array_free(fa);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ goto out_unlock;
+ }
+ for (i = 0, vma = mm->mmap, pos = 2; vma;
+ vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= filp->f_pos) /* already returned by an earlier call */
+ continue;
+
+ get_file(vma->vm_file); /* keep the file referenced after mmap_sem is dropped */
+ info.file = vma->vm_file;
+ info.len = snprintf(info.name,
+ sizeof(info.name), "%lx-%lx",
+ vma->vm_start, vma->vm_end);
+ if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+ BUG(); /* storage was preallocated above, so a failure here is treated as a bug */
+ }
+ }
+ up_read(&mm->mmap_sem);
+
+ for (i = 0; i < nr_files; i++) {
+ p = flex_array_get(fa, i);
+ ret = proc_fill_cache(filp, dirent, filldir,
+ p->name, p->len,
+ proc_map_files_instantiate,
+ task, p->file);
+ if (ret)
+ break; /* entry i still holds its file reference; the loop below drops it */
+ filp->f_pos++;
+ fput(p->file);
+ }
+ for (; i < nr_files; i++) {
+ /*
+ * In case of error don't forget
+ * to put rest of file refs.
+ */
+ p = flex_array_get(fa, i);
+ fput(p->file);
+ }
+ if (fa)
+ flex_array_free(fa);
+ mmput(mm);
+ }
+ }
+
+out_unlock:
+ unlock_trace(task);
+out_put_task:
+ put_task_struct(task);
+out:
+ return ret;
+}
+
+static const struct file_operations proc_map_files_operations = { /* file ops of the map_files directory */
+ .read = generic_read_dir,
+ .readdir = proc_map_files_readdir,
+ .llseek = default_llseek,
+};
+
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
/*
* /proc/pid/fd needs a special permission handler so that a process can still
* access /proc/self/fd after it has executed a setuid().
@@ -2658,6 +3068,9 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
@@ -2761,6 +3174,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.lookup = proc_tgid_base_lookup,
.getattr = pid_getattr,
.setattr = proc_setattr,
+ .permission = proc_pid_permission,
};
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -2964,6 +3378,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
proc_pid_instantiate, iter.task, NULL);
}
+static int fake_filldir(void *buf, const char *name, int namelen, /* filldir_t stand-in that emits nothing and reports success */
+ loff_t offset, u64 ino, unsigned d_type)
+{
+ return 0; /* proc_pid_readdir() substitutes this for tasks that fail has_pid_permissions() */
+}
+
/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
@@ -2971,6 +3391,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
struct task_struct *reaper;
struct tgid_iter iter;
struct pid_namespace *ns;
+ filldir_t __filldir;
if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
goto out_no_task;
@@ -2992,8 +3413,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
for (iter = next_tgid(ns, iter);
iter.task;
iter.tgid += 1, iter = next_tgid(ns, iter)) {
+ if (has_pid_permissions(ns, iter.task, 2))
+ __filldir = filldir;
+ else
+ __filldir = fake_filldir;
+
filp->f_pos = iter.tgid + TGID_OFFSET;
- if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+ if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
put_task_struct(iter.task);
goto out;
}
@@ -3328,6 +3754,7 @@ static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
.setattr = proc_setattr,
+ .permission = proc_pid_permission,
};
static const struct file_operations proc_task_operations = {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 51a176622b8f..84fd3235a590 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -7,6 +7,7 @@
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
+#include <linux/pid_namespace.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
@@ -17,7 +18,9 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/seq_file.h>
#include <linux/slab.h>
+#include <linux/mount.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -101,12 +104,27 @@ void __init proc_init_inodecache(void)
init_once);
}
+static int proc_show_options(struct seq_file *seq, struct dentry *root) /* ->show_options: print the gid= and hidepid= settings that are non-zero */
+{
+ struct super_block *sb = root->d_sb;
+ struct pid_namespace *pid = sb->s_fs_info;
+
+ if (pid->pid_gid) /* a gid of 0 is not printed */
+ seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid);
+ if (pid->hide_pid != 0)
+ seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+
+ return 0;
+}
+
static const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.destroy_inode = proc_destroy_inode,
.drop_inode = generic_delete_inode,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
+ .remount_fs = proc_remount,
+ .show_options = proc_show_options,
};
static void __pde_users_dec(struct proc_dir_entry *pde)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7838e5cfec14..292577531ad1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde);
int proc_fill_super(struct super_block *);
struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+int proc_remount(struct super_block *sb, int *flags, char *data);
/*
* These are generic /proc routines that use the internal
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 03102d978180..46a15d8a29ca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,6 +18,7 @@
#include <linux/bitops.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
+#include <linux/parser.h>
#include "internal.h"
@@ -36,6 +37,63 @@ static int proc_set_super(struct super_block *sb, void *data)
return err;
}
+enum { /* token ids returned by match_token() in proc_parse_options() */
+ Opt_gid, Opt_hidepid, Opt_err,
+};
+
+static const match_table_t tokens = { /* mount option patterns; each option takes one numeric argument */
+ {Opt_hidepid, "hidepid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_err, NULL},
+};
+
+static int proc_parse_options(char *options, struct pid_namespace *pid) /* parse "gid=N,hidepid=N" into pid; returns 1 on success, 0 on any error */
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+
+ if (!options)
+ return 1; /* no option string: nothing to change, counts as success */
+
+ while ((p = strsep(&options, ",")) != NULL) { /* strsep() splits the caller's string in place */
+ int token;
+ if (!*p)
+ continue; /* skip empty fields such as ",," */
+
+ args[0].to = args[0].from = 0;
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return 0;
+ pid->pid_gid = option;
+ break;
+ case Opt_hidepid:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0 || option > 2) {
+ pr_err("proc: hidepid value must be between 0 and 2.\n");
+ return 0;
+ }
+ pid->hide_pid = option;
+ break;
+ default:
+ pr_err("proc: unrecognized mount option \"%s\" "
+ "or missing value\n", p);
+ return 0; /* options parsed before this one have already been stored in pid */
+ }
+ }
+
+ return 1;
+}
+
+int proc_remount(struct super_block *sb, int *flags, char *data) /* ->remount_fs: re-parse the gid=/hidepid= options; 0 on success, -EINVAL on a bad option string */
+{
+ struct pid_namespace *pid = sb->s_fs_info; /* pid namespace stored in the superblock */
+ return proc_parse_options(data, pid) ? 0 : -EINVAL; /* parser returns 1/0; report failure as -EINVAL like proc_mount() instead of a bare positive 1 */
+}
+
static struct dentry *proc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
@@ -43,11 +101,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
struct super_block *sb;
struct pid_namespace *ns;
struct proc_inode *ei;
+ char *options;
- if (flags & MS_KERNMOUNT)
+ if (flags & MS_KERNMOUNT) {
ns = (struct pid_namespace *)data;
- else
+ options = NULL;
+ } else {
ns = current->nsproxy->pid_ns;
+ options = data;
+ }
sb = sget(fs_type, proc_test_super, proc_set_super, ns);
if (IS_ERR(sb))
@@ -55,6 +117,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
if (!sb->s_root) {
sb->s_flags = flags;
+ if (!proc_parse_options(options, ns)) {
+ deactivate_locked_super(sb);
+ return ERR_PTR(-EINVAL);
+ }
err = proc_fill_super(sb);
if (err) {
deactivate_locked_super(sb);
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index a945cd265228..70de42f09f1d 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1364,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
struct reiserfs_bitmap_info *bitmap;
unsigned int bmap_nr = reiserfs_bmap_count(sb);
- /* Avoid lock recursion in fault case */
- reiserfs_write_unlock(sb);
bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
- reiserfs_write_lock(sb);
if (bitmap == NULL)
return -ENOMEM;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index eb711060a6f2..c3cf54fd4de3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
char b[BDEVNAME_SIZE];
int ret;
- /*
- * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
- * dependency inversion warnings.
- */
- reiserfs_write_unlock(sb);
journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
if (!journal) {
reiserfs_warning(sb, "journal-1256",
"unable to get memory for journal structure");
- reiserfs_write_lock(sb);
return 1;
}
INIT_LIST_HEAD(&journal->j_bitmap_nodes);
@@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
INIT_LIST_HEAD(&journal->j_working_list);
INIT_LIST_HEAD(&journal->j_journal_list);
journal->j_persistent_trans = 0;
- ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
- reiserfs_bmap_count(sb));
- reiserfs_write_lock(sb);
- if (ret)
+ if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
+ reiserfs_bmap_count(sb)))
goto free_and_return;
allocate_bitmap_nodes(sb);
@@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
goto free_and_return;
}
- /*
- * We need to unlock here to avoid creating the following
- * dependency:
- * reiserfs_lock -> sysfs_mutex
- * Because the reiserfs mmap path creates the following dependency:
- * mm->mmap -> reiserfs_lock, hence we have
- * mm->mmap -> reiserfs_lock ->sysfs_mutex
- * This would ends up in a circular dependency with sysfs readdir path
- * which does sysfs_mutex -> mm->mmap_sem
- * This is fine because the reiserfs lock is useless in mount path,
- * at least until we call journal_begin. We keep it for paranoid
- * reasons.
- */
- reiserfs_write_unlock(sb);
if (journal_init_dev(sb, journal, j_dev_name) != 0) {
- reiserfs_write_lock(sb);
reiserfs_warning(sb, "sh-462",
"unable to initialize jornal device");
goto free_and_return;
}
- reiserfs_write_lock(sb);
rs = SB_DISK_SUPER_BLOCK(sb);
@@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
journal->j_mount_id = 10;
journal->j_state = 0;
atomic_set(&(journal->j_jlock), 0);
- reiserfs_write_unlock(sb);
journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
- reiserfs_write_lock(sb);
journal->j_cnode_free_orig = journal->j_cnode_free_list;
journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
journal->j_cnode_used = 0;
@@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
init_journal_hash(sb);
jl = journal->j_current_jl;
+
+ /*
+ * get_list_bitmap() may call flush_commit_list() which
+ * requires the lock. Calling flush_commit_list() shouldn't happen
+ * this early but I like to be paranoid.
+ */
+ reiserfs_write_lock(sb);
jl->j_list_bitmap = get_list_bitmap(sb, jl);
+ reiserfs_write_unlock(sb);
if (!jl->j_list_bitmap) {
reiserfs_warning(sb, "journal-2005",
"get_list_bitmap failed for journal list 0");
goto free_and_return;
}
- if (journal_read(sb) < 0) {
+
+ /*
+ * Journal_read needs to be inspected in order to push down
+ * the lock further inside (or even remove it).
+ */
+ reiserfs_write_lock(sb);
+ ret = journal_read(sb);
+ reiserfs_write_unlock(sb);
+ if (ret < 0) {
reiserfs_warning(sb, "reiserfs-2006",
"Replay Failure, unable to mount");
goto free_and_return;
}
reiserfs_mounted_fs_count++;
- if (reiserfs_mounted_fs_count <= 1) {
- reiserfs_write_unlock(sb);
+ if (reiserfs_mounted_fs_count <= 1)
commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
- reiserfs_write_lock(sb);
- }
INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
journal->j_work_sb = sb;
@@ -2896,14 +2883,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
journal->j_cnode_free < (journal->j_trans_max * 3)) {
return 1;
}
- /* protected by the BKL here */
+
journal->j_len_alloc += new_alloc;
th->t_blocks_allocated += new_alloc ;
return 0;
}
-/* this must be called inside a transaction, and requires the
-** kernel_lock to be held
+/* this must be called inside a transaction
*/
void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
{
@@ -2914,8 +2900,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
return;
}
-/* this must be called without a transaction started, and does not
-** require BKL
+/* this must be called without a transaction started
*/
void reiserfs_allow_writes(struct super_block *s)
{
@@ -2924,8 +2909,7 @@ void reiserfs_allow_writes(struct super_block *s)
wake_up(&journal->j_join_wait);
}
-/* this must be called without a transaction started, and does not
-** require BKL
+/* this must be called without a transaction started
*/
void reiserfs_wait_on_write_block(struct super_block *s)
{
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1d42e707d5fa..e12d8b97cd4d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1519,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset)
static int reread_meta_blocks(struct super_block *s)
{
ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
- reiserfs_write_unlock(s);
wait_on_buffer(SB_BUFFER_WITH_SB(s));
- reiserfs_write_lock(s);
if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
reiserfs_warning(s, "reiserfs-2504", "error reading the super");
return 1;
@@ -1746,22 +1744,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
mutex_init(&REISERFS_SB(s)->lock);
REISERFS_SB(s)->lock_depth = -1;
- /*
- * This function is called with the bkl, which also was the old
- * locking used here.
- * do_journal_begin() will soon check if we hold the lock (ie: was the
- * bkl). This is likely because do_journal_begin() has several another
- * callers because at this time, it doesn't seem to be necessary to
- * protect against anything.
- * Anyway, let's be conservative and lock for now.
- */
- reiserfs_write_lock(s);
-
jdev_name = NULL;
if (reiserfs_parse_options
(s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
&commit_max_age, qf_names, &qfmt) == 0) {
- goto error;
+ goto error_unlocked;
}
if (jdev_name && jdev_name[0]) {
REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
@@ -1777,7 +1764,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (blocks) {
SWARN(silent, s, "jmacd-7", "resize option for remount only");
- goto error;
+ goto error_unlocked;
}
/* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
@@ -1787,7 +1774,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
reiserfs_bdevname(s));
- goto error;
+ goto error_unlocked;
}
rs = SB_DISK_SUPER_BLOCK(s);
@@ -1803,7 +1790,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
"or increase size of your LVM partition");
SWARN(silent, s, "", "Or may be you forgot to "
"reboot after fdisk when it told you to");
- goto error;
+ goto error_unlocked;
}
sbi->s_mount_state = SB_REISERFS_STATE(s);
@@ -1811,8 +1798,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if ((errval = reiserfs_init_bitmap_cache(s))) {
SWARN(silent, s, "jmacd-8", "unable to read bitmap");
- goto error;
+ goto error_unlocked;
}
+
errval = -EINVAL;
#ifdef CONFIG_REISERFS_CHECK
SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
@@ -1835,24 +1823,26 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (reiserfs_barrier_flush(s)) {
printk("reiserfs: using flush barriers\n");
}
+
// set_device_ro(s->s_dev, 1) ;
if (journal_init(s, jdev_name, old_format, commit_max_age)) {
SWARN(silent, s, "sh-2022",
"unable to initialize journal space");
- goto error;
+ goto error_unlocked;
} else {
jinit_done = 1; /* once this is set, journal_release must be called
** if we error out of the mount
*/
}
+
if (reread_meta_blocks(s)) {
SWARN(silent, s, "jmacd-9",
"unable to reread meta blocks after journal init");
- goto error;
+ goto error_unlocked;
}
if (replay_only(s))
- goto error;
+ goto error_unlocked;
if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
SWARN(silent, s, "clm-7000",
@@ -1866,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
reiserfs_init_locked_inode, (void *)(&args));
if (!root_inode) {
SWARN(silent, s, "jmacd-10", "get root inode failed");
- goto error;
+ goto error_unlocked;
}
+ /*
+ * This path assumed to be called with the BKL in the old times.
+ * Now we have inherited the big reiserfs lock from it and many
+ * reiserfs helpers called in the mount path and elsewhere require
+ * this lock to be held even if it's not always necessary. Let's be
+ * conservative and hold it early. The window can be reduced after
+ * careful review of the code.
+ */
+ reiserfs_write_lock(s);
+
if (root_inode->i_state & I_NEW) {
reiserfs_read_locked_inode(root_inode, &args);
unlock_new_inode(root_inode);
@@ -1995,12 +1995,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
return (0);
error:
- if (jinit_done) { /* kill the commit thread, free journal ram */
+ reiserfs_write_unlock(s);
+
+error_unlocked:
+ /* kill the commit thread, free journal ram */
+ if (jinit_done) {
+ reiserfs_write_lock(s);
journal_release_error(NULL, s);
+ reiserfs_write_unlock(s);
}
- reiserfs_write_unlock(s);
-
reiserfs_free_bitmap_cache(s);
if (SB_BUFFER_WITH_SB(s))
brelse(SB_BUFFER_WITH_SB(s));
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f744be98cd5a..af0b73802592 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
spin_lock(&cache->lock);
while (1) {
- for (i = 0; i < cache->entries; i++)
- if (cache->entry[i].block == block)
+ for (i = cache->curr_blk, n = 0; n < cache->entries; n++) {
+ if (cache->entry[i].block == block) {
+ cache->curr_blk = i;
break;
+ }
+ i = (i + 1) % cache->entries;
+ }
- if (i == cache->entries) {
+ if (n == cache->entries) {
/*
* Block not in cache, if all cache entries are used
* go to sleep waiting for one to become available.
@@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
goto cleanup;
}
+ cache->curr_blk = 0;
cache->next_blk = 0;
cache->unused = entries;
cache->entries = entries;
@@ -332,17 +337,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
u64 *block, int *offset, int length)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
- int bytes, copied = length;
+ int bytes, res = length;
struct squashfs_cache_entry *entry;
TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
while (length) {
entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
- if (entry->error)
- return entry->error;
- else if (*offset >= entry->length)
- return -EIO;
+ if (entry->error) {
+ res = entry->error;
+ goto error;
+ } else if (*offset >= entry->length) {
+ res = -EIO;
+ goto error;
+ }
bytes = squashfs_copy_data(buffer, entry, *offset, length);
if (buffer)
@@ -358,7 +366,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
squashfs_cache_put(entry);
}
- return copied;
+ return res;
+
+error:
+ squashfs_cache_put(entry);
+ return res;
}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index fd7b3b3bda13..81afbccfa843 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_op = &squashfs_inode_ops;
inode->i_fop = &generic_ro_fops;
inode->i_mode |= S_IFREG;
- inode->i_blocks = ((inode->i_size -
- le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
+ inode->i_blocks = (inode->i_size -
+ le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
squashfs_i(inode)->fragment_block = frag_blk;
squashfs_i(inode)->fragment_size = frag_size;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 651f0b31d296..52934a22f296 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -28,6 +28,7 @@
struct squashfs_cache {
char *name;
int entries;
+ int curr_blk;
int next_blk;
int num_waiters;
int unused;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index d0858c2d9a47..ecaa2f7bdb8f 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -290,7 +290,7 @@ handle_fragments:
check_directory_table:
/* Sanity check directory_table */
- if (msblk->directory_table >= next_table) {
+ if (msblk->directory_table > next_table) {
err = -EINVAL;
goto failed_mount;
}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b09ba2dd8b62..f922cbacdb96 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -38,9 +38,6 @@
DEFINE_SPINLOCK(dbg_lock);
-static char dbg_key_buf0[128];
-static char dbg_key_buf1[128];
-
static const char *get_key_fmt(int fmt)
{
switch (fmt) {
@@ -103,8 +100,8 @@ static const char *get_dent_type(int type)
}
}
-static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
- char *buffer)
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+ const union ubifs_key *key, char *buffer, int len)
{
char *p = buffer;
int type = key_type(c, key);
@@ -112,45 +109,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
switch (type) {
case UBIFS_INO_KEY:
- sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key),
- get_key_type(type));
+ len -= snprintf(p, len, "(%lu, %s)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type));
break;
case UBIFS_DENT_KEY:
case UBIFS_XENT_KEY:
- sprintf(p, "(%lu, %s, %#08x)",
- (unsigned long)key_inum(c, key),
- get_key_type(type), key_hash(c, key));
+ len -= snprintf(p, len, "(%lu, %s, %#08x)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type), key_hash(c, key));
break;
case UBIFS_DATA_KEY:
- sprintf(p, "(%lu, %s, %u)",
- (unsigned long)key_inum(c, key),
- get_key_type(type), key_block(c, key));
+ len -= snprintf(p, len, "(%lu, %s, %u)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type), key_block(c, key));
break;
case UBIFS_TRUN_KEY:
- sprintf(p, "(%lu, %s)",
- (unsigned long)key_inum(c, key),
- get_key_type(type));
+ len -= snprintf(p, len, "(%lu, %s)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type));
break;
default:
- sprintf(p, "(bad key type: %#08x, %#08x)",
- key->u32[0], key->u32[1]);
+ len -= snprintf(p, len, "(bad key type: %#08x, %#08x)",
+ key->u32[0], key->u32[1]);
}
} else
- sprintf(p, "bad key format %d", c->key_fmt);
-}
-
-const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
-{
- /* dbg_lock must be held */
- sprintf_key(c, key, dbg_key_buf0);
- return dbg_key_buf0;
-}
-
-const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
-{
- /* dbg_lock must be held */
- sprintf_key(c, key, dbg_key_buf1);
- return dbg_key_buf1;
+ len -= snprintf(p, len, "bad key format %d", c->key_fmt);
+ ubifs_assert(len > 0);
+ return p;
}
const char *dbg_ntype(int type)
@@ -319,6 +305,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
int i, n;
union ubifs_key key;
const struct ubifs_ch *ch = node;
+ char key_buf[DBG_KEY_BUF_LEN];
if (dbg_is_tst_rcvry(c))
return;
@@ -474,7 +461,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
const struct ubifs_ino_node *ino = node;
key_read(c, &ino->key, &key);
- printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ printk(KERN_DEBUG "\tkey %s\n",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
(unsigned long long)le64_to_cpu(ino->creat_sqnum));
printk(KERN_DEBUG "\tsize %llu\n",
@@ -517,7 +505,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
int nlen = le16_to_cpu(dent->nlen);
key_read(c, &dent->key, &key);
- printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ printk(KERN_DEBUG "\tkey %s\n",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
printk(KERN_DEBUG "\tinum %llu\n",
(unsigned long long)le64_to_cpu(dent->inum));
printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
@@ -541,7 +530,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
key_read(c, &dn->key, &key);
- printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ printk(KERN_DEBUG "\tkey %s\n",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
printk(KERN_DEBUG "\tsize %u\n",
le32_to_cpu(dn->size));
printk(KERN_DEBUG "\tcompr_typ %d\n",
@@ -582,7 +572,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
key_read(c, &br->key, &key);
printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
- le32_to_cpu(br->len), DBGKEY(&key));
+ le32_to_cpu(br->len),
+ dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
}
break;
}
@@ -934,6 +926,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
{
int n;
const struct ubifs_zbranch *zbr;
+ char key_buf[DBG_KEY_BUF_LEN];
spin_lock(&dbg_lock);
if (znode->parent)
@@ -958,12 +951,16 @@ void dbg_dump_znode(const struct ubifs_info *c,
printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
"%s\n", n, zbr->znode, zbr->lnum,
zbr->offs, zbr->len,
- DBGKEY(&zbr->key));
+ dbg_snprintf_key(c, &zbr->key,
+ key_buf,
+ DBG_KEY_BUF_LEN));
else
printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
"%s\n", n, zbr->znode, zbr->lnum,
zbr->offs, zbr->len,
- DBGKEY(&zbr->key));
+ dbg_snprintf_key(c, &zbr->key,
+ key_buf,
+ DBG_KEY_BUF_LEN));
}
spin_unlock(&dbg_lock);
}
@@ -1260,6 +1257,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
int err, nlen1, nlen2, cmp;
struct ubifs_dent_node *dent1, *dent2;
union ubifs_key key;
+ char key_buf[DBG_KEY_BUF_LEN];
ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
@@ -1290,9 +1288,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
key_read(c, &dent1->key, &key);
if (keys_cmp(c, &zbr1->key, &key)) {
dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
- zbr1->offs, DBGKEY(&key));
+ zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_err("but it should have key %s according to tnc",
- DBGKEY(&zbr1->key));
+ dbg_snprintf_key(c, &zbr1->key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_dump_node(c, dent1);
goto out_free;
}
@@ -1300,9 +1300,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
key_read(c, &dent2->key, &key);
if (keys_cmp(c, &zbr2->key, &key)) {
dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
- zbr1->offs, DBGKEY(&key));
+ zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_err("but it should have key %s according to tnc",
- DBGKEY(&zbr2->key));
+ dbg_snprintf_key(c, &zbr2->key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_dump_node(c, dent2);
goto out_free;
}
@@ -1319,7 +1321,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
dbg_err("2 xent/dent nodes with the same name");
else
dbg_err("bad order of colliding key %s",
- DBGKEY(&key));
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
dbg_dump_node(c, dent1);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8d9c46810189..307ab1d23f75 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -169,40 +169,39 @@ struct ubifs_global_debug_info {
spin_unlock(&dbg_lock); \
} while (0)
-const char *dbg_key_str0(const struct ubifs_info *c,
- const union ubifs_key *key);
-const char *dbg_key_str1(const struct ubifs_info *c,
- const union ubifs_key *key);
-
-/*
- * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message
- * macros.
- */
-#define DBGKEY(key) dbg_key_str0(c, (key))
-#define DBGKEY1(key) dbg_key_str1(c, (key))
-
-extern spinlock_t dbg_lock;
-
-#define ubifs_dbg_msg(type, fmt, ...) do { \
- spin_lock(&dbg_lock); \
- pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
- spin_unlock(&dbg_lock); \
+#define ubifs_dbg_msg(type, fmt, ...) \
+ pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
+
+#define DBG_KEY_BUF_LEN 32
+#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \
+ char __tmp_key_buf[DBG_KEY_BUF_LEN]; \
+ pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \
+ dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \
} while (0)
/* Just a debugging messages not related to any specific UBIFS subsystem */
-#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...) \
+ printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
+ __func__, ##__VA_ARGS__)
+
/* General messages */
#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
/* Additional journal messages */
#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
+#define dbg_jnlk(key, fmt, ...) \
+ ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__)
/* Additional TNC messages */
#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
+#define dbg_tnck(key, fmt, ...) \
+ ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__)
/* Additional lprops messages */
#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
/* Additional LEB find messages */
#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
/* Additional mount messages */
#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
+#define dbg_mntk(key, fmt, ...) \
+ ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__)
/* Additional I/O messages */
#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
/* Additional commit messages */
@@ -218,6 +217,7 @@ extern spinlock_t dbg_lock;
/* Additional recovery messages */
#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
+extern spinlock_t dbg_lock;
extern struct ubifs_global_debug_info ubifs_dbg;
static inline int dbg_is_chk_gen(const struct ubifs_info *c)
@@ -258,6 +258,8 @@ const char *dbg_cstate(int cmt_state);
const char *dbg_jhead(int jhead);
const char *dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key);
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+ const union ubifs_key *key, char *buffer, int len);
void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
void dbg_dump_node(const struct ubifs_info *c, const void *node);
void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
@@ -368,6 +370,10 @@ static inline const char *dbg_jhead(int jhead) { return ""; }
static inline const char *
dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key) { return ""; }
+static inline const char *
+dbg_snprintf_key(const struct ubifs_info *c,
+ const union ubifs_key *key, char *buffer,
+ int len) { return ""; }
static inline void dbg_dump_inode(struct ubifs_info *c,
const struct inode *inode) { return; }
static inline void dbg_dump_node(const struct ubifs_info *c,
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index cef0460f4c54..2f438ab2e7a2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -697,9 +697,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
struct ubifs_inode *ui = ubifs_inode(inode);
- dbg_jnl("ino %lu, blk %u, len %d, key %s",
- (unsigned long)key_inum(c, key), key_block(c, key), len,
- DBGKEY(key));
+ dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
+ (unsigned long)key_inum(c, key), key_block(c, key), len);
ubifs_assert(len <= UBIFS_BLOCK_SIZE);
data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
@@ -1177,7 +1176,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
blk = new_size >> UBIFS_BLOCK_SHIFT;
data_key_init(c, &key, inum, blk);
- dbg_jnl("last block key %s", DBGKEY(&key));
+ dbg_jnlk(&key, "last block key ");
err = ubifs_tnc_lookup(c, &key, dn);
if (err == -ENOENT)
dlen = 0; /* Not found (so it is a hole) */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 6189c74d97f0..66d59d0a1402 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1986,12 +1986,11 @@ again:
if (path[h].in_tree)
continue;
- nnode = kmalloc(sz, GFP_NOFS);
+ nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS);
if (!nnode) {
err = -ENOMEM;
goto out;
}
- memcpy(nnode, &path[h].nnode, sz);
parent = nnode->parent;
parent->nbranch[nnode->iip].nnode = nnode;
path[h].ptr.nnode = nnode;
@@ -2004,12 +2003,11 @@ again:
const size_t sz = sizeof(struct ubifs_pnode);
struct ubifs_nnode *parent;
- pnode = kmalloc(sz, GFP_NOFS);
+ pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS);
if (!pnode) {
err = -ENOMEM;
goto out;
}
- memcpy(pnode, &path[h].pnode, sz);
parent = pnode->parent;
parent->nbranch[pnode->iip].pnode = pnode;
path[h].ptr.pnode = pnode;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ccabaf1164b3..b007637f0406 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -221,8 +221,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
{
int err;
- dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
- r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
+ dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ",
+ r->lnum, r->offs, r->len, r->deletion, r->sqnum);
/* Set c->replay_sqnum to help deal with dangling branches. */
c->replay_sqnum = r->sqnum;
@@ -361,7 +361,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
{
struct replay_entry *r;
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
@@ -409,7 +409,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
struct replay_entry *r;
char *nbuf;
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 066738647685..16ad84d8402f 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -344,12 +344,11 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
return err;
}
- lnc_node = kmalloc(zbr->len, GFP_NOFS);
+ lnc_node = kmemdup(node, zbr->len, GFP_NOFS);
if (!lnc_node)
/* We don't have to have the cache, so no error */
return 0;
- memcpy(lnc_node, node, zbr->len);
zbr->leaf = lnc_node;
return 0;
}
@@ -506,7 +505,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
{
int ret;
- dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key));
+ dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
zbr->offs);
@@ -520,8 +519,8 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
ret = 0;
}
if (ret == 0 && c->replaying)
- dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
- zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
+ dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ",
+ zbr->lnum, zbr->offs, zbr->len);
return ret;
}
@@ -996,9 +995,9 @@ static int fallible_resolve_collision(struct ubifs_info *c,
if (adding || !o_znode)
return 0;
- dbg_mnt("dangling match LEB %d:%d len %d %s",
+ dbg_mntk(key, "dangling match LEB %d:%d len %d key ",
o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
- o_znode->zbranch[o_n].len, DBGKEY(key));
+ o_znode->zbranch[o_n].len);
*zn = o_znode;
*n = o_n;
return 1;
@@ -1180,7 +1179,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
unsigned long time = get_seconds();
- dbg_tnc("search key %s", DBGKEY(key));
+ dbg_tnck(key, "search key ");
ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
znode = c->zroot.znode;
@@ -1316,7 +1315,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
unsigned long time = get_seconds();
- dbg_tnc("search and dirty key %s", DBGKEY(key));
+ dbg_tnck(key, "search and dirty key ");
znode = c->zroot.znode;
if (unlikely(!znode)) {
@@ -1723,8 +1722,8 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
if (!keys_eq(c, &zbr->key, &key1)) {
ubifs_err("bad key in node at LEB %d:%d",
zbr->lnum, zbr->offs);
- dbg_tnc("looked for key %s found node's key %s",
- DBGKEY(&zbr->key), DBGKEY1(&key1));
+ dbg_tnck(&zbr->key, "looked for key ");
+ dbg_tnck(&key1, "found node's key ");
goto out_err;
}
@@ -1777,7 +1776,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
ubifs_err("failed to read from LEB %d:%d, error %d",
lnum, offs, err);
dbg_dump_stack();
- dbg_tnc("key %s", DBGKEY(&bu->key));
+ dbg_tnck(&bu->key, "key ");
return err;
}
@@ -1812,7 +1811,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
int found, n, err;
struct ubifs_znode *znode;
- dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
+ dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
mutex_lock(&c->tnc_mutex);
found = ubifs_lookup_level0(c, key, &znode, &n);
if (!found) {
@@ -1986,8 +1985,7 @@ again:
zp = znode->parent;
if (znode->child_cnt < c->fanout) {
ubifs_assert(n != c->fanout);
- dbg_tnc("inserted at %d level %d, key %s", n, znode->level,
- DBGKEY(key));
+ dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level);
insert_zbranch(znode, zbr, n);
@@ -2002,7 +2000,7 @@ again:
* Unfortunately, @znode does not have more empty slots and we have to
* split it.
*/
- dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key));
+ dbg_tnck(key, "splitting level %d, key ", znode->level);
if (znode->alt)
/*
@@ -2096,7 +2094,7 @@ do_split:
}
/* Insert new key and branch */
- dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key));
+ dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level);
insert_zbranch(zi, zbr, n);
@@ -2172,7 +2170,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key));
+ dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len);
found = lookup_level0_dirty(c, key, &znode, &n);
if (!found) {
struct ubifs_zbranch zbr;
@@ -2221,8 +2219,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum,
- old_offs, lnum, offs, len, DBGKEY(key));
+ dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum,
+ old_offs, lnum, offs, len);
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2304,8 +2302,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name,
- DBGKEY(key));
+ dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
+ lnum, offs, nm->len, nm->name);
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2398,7 +2396,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
/* Delete without merge for now */
ubifs_assert(znode->level == 0);
ubifs_assert(n >= 0 && n < c->fanout);
- dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key));
+ dbg_tnck(&znode->zbranch[n].key, "deleting key ");
zbr = &znode->zbranch[n];
lnc_free(zbr);
@@ -2508,7 +2506,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("key %s", DBGKEY(key));
+ dbg_tnck(key, "key ");
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2539,7 +2537,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key));
+ dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
err = lookup_level0_dirty(c, key, &znode, &n);
if (err < 0)
goto out_unlock;
@@ -2654,7 +2652,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
dbg_dump_znode(c, znode);
goto out_unlock;
}
- dbg_tnc("removing %s", DBGKEY(key));
+ dbg_tnck(key, "removing key ");
}
if (k) {
for (i = n + 1 + k; i < znode->child_cnt; i++)
@@ -2774,7 +2772,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
struct ubifs_zbranch *zbr;
union ubifs_key *dkey;
- dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
+ dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
ubifs_assert(is_hash_key(c, key));
mutex_lock(&c->tnc_mutex);
@@ -3333,9 +3331,9 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
out_dump:
block = key_block(c, key);
- ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
- "(data key %s)", (unsigned long)inode->i_ino, size,
- ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
+ ubifs_err("inode %lu has size %lld, but there are data at offset %lld",
+ (unsigned long)inode->i_ino, size,
+ ((loff_t)block) << UBIFS_BLOCK_SHIFT);
mutex_unlock(&c->tnc_mutex);
dbg_dump_inode(c, inode);
dbg_dump_stack();
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index b48db999903e..dc28fe6ec07a 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
case UBIFS_XENT_KEY:
break;
default:
- dbg_msg("bad key type at slot %d: %s", i,
- DBGKEY(&zbr->key));
+ dbg_msg("bad key type at slot %d: %d",
+ i, key_type(c, &zbr->key));
err = 3;
goto out_dump;
}
@@ -475,7 +475,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
zbr->offs);
if (err) {
- dbg_tnc("key %s", DBGKEY(key));
+ dbg_tnck(key, "key ");
return err;
}
@@ -484,8 +484,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
if (!keys_eq(c, key, &key1)) {
ubifs_err("bad key in node at LEB %d:%d",
zbr->lnum, zbr->offs);
- dbg_tnc("looked for key %s found node's key %s",
- DBGKEY(key), DBGKEY1(&key1));
+ dbg_tnck(key, "looked for key ");
+ dbg_tnck(&key1, "but found node's key ");
dbg_dump_node(c, node);
return -EINVAL;
}
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index bf18f7a04544..85b272268754 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -138,12 +138,11 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
ui = ubifs_inode(inode);
ui->xattr = 1;
ui->flags |= UBIFS_XATTR_FL;
- ui->data = kmalloc(size, GFP_NOFS);
+ ui->data = kmemdup(value, size, GFP_NOFS);
if (!ui->data) {
err = -ENOMEM;
goto out_free;
}
- memcpy(ui->data, value, size);
inode->i_size = ui->ui_size = size;
ui->data_len = size;
@@ -204,12 +203,11 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
return err;
kfree(ui->data);
- ui->data = kmalloc(size, GFP_NOFS);
+ ui->data = kmemdup(value, size, GFP_NOFS);
if (!ui->data) {
err = -ENOMEM;
goto out_free;
}
- memcpy(ui->data, value, size);
inode->i_size = ui->ui_size = size;
ui->data_len = size;
OpenPOWER on IntegriCloud