summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c5
-rw-r--r--fs/9p/vfs_dir.c72
-rw-r--r--fs/adfs/dir.c42
-rw-r--r--fs/affs/dir.c69
-rw-r--r--fs/afs/dir.c99
-rw-r--r--fs/afs/file.c10
-rw-r--r--fs/aio.c43
-rw-r--r--fs/autofs4/root.c4
-rw-r--r--fs/bad_inode.c4
-rw-r--r--fs/befs/linuxvfs.c40
-rw-r--r--fs/bfs/dir.c35
-rw-r--r--fs/btrfs/backref.c3
-rw-r--r--fs/btrfs/check-integrity.c2
-rw-r--r--fs/btrfs/ctree.c4
-rw-r--r--fs/btrfs/ctree.h8
-rw-r--r--fs/btrfs/delayed-inode.c9
-rw-r--r--fs/btrfs/delayed-inode.h3
-rw-r--r--fs/btrfs/delayed-ref.h1
-rw-r--r--fs/btrfs/dev-replace.c5
-rw-r--r--fs/btrfs/disk-io.c65
-rw-r--r--fs/btrfs/extent-tree.c94
-rw-r--r--fs/btrfs/extent_io.c140
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/free-space-cache.c43
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode-map.c8
-rw-r--r--fs/btrfs/inode.c136
-rw-r--r--fs/btrfs/ioctl.c10
-rw-r--r--fs/btrfs/raid56.c2
-rw-r--r--fs/btrfs/relocation.c16
-rw-r--r--fs/btrfs/scrub.c10
-rw-r--r--fs/btrfs/super.c1
-rw-r--r--fs/btrfs/volumes.c54
-rw-r--r--fs/btrfs/volumes.h20
-rw-r--r--fs/buffer.c21
-rw-r--r--fs/cachefiles/interface.c13
-rw-r--r--fs/cachefiles/namei.c10
-rw-r--r--fs/cachefiles/xattr.c6
-rw-r--r--fs/ceph/addr.c15
-rw-r--r--fs/ceph/dir.c99
-rw-r--r--fs/ceph/locks.c73
-rw-r--r--fs/ceph/mds_client.c65
-rw-r--r--fs/ceph/super.h9
-rw-r--r--fs/cifs/cifs_dfs_ref.c141
-rw-r--r--fs/cifs/cifsfs.c5
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/connect.c27
-rw-r--r--fs/cifs/dns_resolve.c4
-rw-r--r--fs/cifs/file.c5
-rw-r--r--fs/cifs/inode.c3
-rw-r--r--fs/cifs/readdir.c178
-rw-r--r--fs/coda/dir.c66
-rw-r--r--fs/compat.c43
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/configfs/dir.c122
-rw-r--r--fs/cramfs/inode.c21
-rw-r--r--fs/dcache.c11
-rw-r--r--fs/debugfs/file.c43
-rw-r--r--fs/dlm/config.c5
-rw-r--r--fs/dlm/lock.c8
-rw-r--r--fs/dlm/lockspace.c9
-rw-r--r--fs/dlm/lowcomms.c177
-rw-r--r--fs/ecryptfs/file.c43
-rw-r--r--fs/efivarfs/file.c14
-rw-r--r--fs/efs/dir.c75
-rw-r--r--fs/exec.c16
-rw-r--r--fs/exofs/dir.c38
-rw-r--r--fs/exofs/inode.c6
-rw-r--r--fs/exportfs/expfs.c14
-rw-r--r--fs/ext2/dir.c27
-rw-r--r--fs/ext3/dir.c157
-rw-r--r--fs/ext3/inode.c9
-rw-r--r--fs/ext3/namei.c7
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/dir.c158
-rw-r--r--fs/ext4/ext4.h181
-rw-r--r--fs/ext4/ext4_jbd2.c58
-rw-r--r--fs/ext4/ext4_jbd2.h29
-rw-r--r--fs/ext4/extents.c202
-rw-r--r--fs/ext4/extents_status.c92
-rw-r--r--fs/ext4/extents_status.h8
-rw-r--r--fs/ext4/file.c18
-rw-r--r--fs/ext4/fsync.c52
-rw-r--r--fs/ext4/ialloc.c3
-rw-r--r--fs/ext4/indirect.c40
-rw-r--r--fs/ext4/inline.c168
-rw-r--r--fs/ext4/inode.c1682
-rw-r--r--fs/ext4/mballoc.c27
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c7
-rw-r--r--fs/ext4/page-io.c226
-rw-r--r--fs/ext4/resize.c24
-rw-r--r--fs/ext4/super.c155
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/acl.c2
-rw-r--r--fs/f2fs/checkpoint.c99
-rw-r--r--fs/f2fs/data.c71
-rw-r--r--fs/f2fs/debug.c4
-rw-r--r--fs/f2fs/dir.c145
-rw-r--r--fs/f2fs/f2fs.h66
-rw-r--r--fs/f2fs/file.c58
-rw-r--r--fs/f2fs/gc.c42
-rw-r--r--fs/f2fs/inode.c13
-rw-r--r--fs/f2fs/namei.c17
-rw-r--r--fs/f2fs/node.c37
-rw-r--r--fs/f2fs/node.h68
-rw-r--r--fs/f2fs/recovery.c150
-rw-r--r--fs/f2fs/segment.c101
-rw-r--r--fs/f2fs/super.c253
-rw-r--r--fs/f2fs/xattr.c68
-rw-r--r--fs/f2fs/xattr.h24
-rw-r--r--fs/fat/dir.c104
-rw-r--r--fs/fat/inode.c15
-rw-r--r--fs/file_table.c19
-rw-r--r--fs/freevxfs/vxfs_lookup.c55
-rw-r--r--fs/fs-writeback.c9
-rw-r--r--fs/fscache/cache.c34
-rw-r--r--fs/fscache/cookie.c93
-rw-r--r--fs/fscache/fsdef.c1
-rw-r--r--fs/fscache/internal.h11
-rw-r--r--fs/fscache/main.c11
-rw-r--r--fs/fscache/netfs.c1
-rw-r--r--fs/fscache/object-list.c103
-rw-r--r--fs/fscache/object.c1106
-rw-r--r--fs/fscache/operation.c37
-rw-r--r--fs/fscache/page.c65
-rw-r--r--fs/fuse/dir.c49
-rw-r--r--fs/fuse/file.c62
-rw-r--r--fs/fuse/inode.c7
-rw-r--r--fs/gfs2/Kconfig5
-rw-r--r--fs/gfs2/aops.c17
-rw-r--r--fs/gfs2/bmap.c21
-rw-r--r--fs/gfs2/dir.c123
-rw-r--r--fs/gfs2/dir.h7
-rw-r--r--fs/gfs2/export.c10
-rw-r--r--fs/gfs2/file.c111
-rw-r--r--fs/gfs2/glops.c8
-rw-r--r--fs/gfs2/inode.c151
-rw-r--r--fs/gfs2/inode.h1
-rw-r--r--fs/gfs2/log.c78
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c28
-rw-r--r--fs/gfs2/lops.h1
-rw-r--r--fs/gfs2/meta_io.c4
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/quota.c11
-rw-r--r--fs/gfs2/rgrp.c27
-rw-r--r--fs/gfs2/super.c6
-rw-r--r--fs/gfs2/trans.c9
-rw-r--r--fs/hfs/bnode.c6
-rw-r--r--fs/hfs/dir.c49
-rw-r--r--fs/hfsplus/dir.c50
-rw-r--r--fs/hostfs/hostfs_kern.c13
-rw-r--r--fs/hpfs/dir.c66
-rw-r--r--fs/hpfs/file.c4
-rw-r--r--fs/hppfs/hppfs.c33
-rw-r--r--fs/internal.h6
-rw-r--r--fs/isofs/dir.c42
-rw-r--r--fs/jbd/transaction.c19
-rw-r--r--fs/jbd2/Kconfig6
-rw-r--r--fs/jbd2/checkpoint.c22
-rw-r--r--fs/jbd2/commit.c184
-rw-r--r--fs/jbd2/journal.c166
-rw-r--r--fs/jbd2/recovery.c11
-rw-r--r--fs/jbd2/revoke.c49
-rw-r--r--fs/jbd2/transaction.c526
-rw-r--r--fs/jffs2/dir.c52
-rw-r--r--fs/jfs/jfs_dtree.c63
-rw-r--r--fs/jfs/jfs_dtree.h2
-rw-r--r--fs/jfs/jfs_logmgr.c8
-rw-r--r--fs/jfs/jfs_metapage.c5
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/super.c38
-rw-r--r--fs/libfs.c80
-rw-r--r--fs/logfs/dir.c49
-rw-r--r--fs/logfs/file.c3
-rw-r--r--fs/logfs/segment.c3
-rw-r--r--fs/minix/dir.c42
-rw-r--r--fs/namei.c4
-rw-r--r--fs/ncpfs/dir.c87
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/callback_xdr.c2
-rw-r--r--fs/nfs/dir.c51
-rw-r--r--fs/nfs/file.c8
-rw-r--r--fs/nfs/nfs4client.c2
-rw-r--r--fs/nfs/nfs4proc.c4
-rw-r--r--fs/nfs/nfs4session.c4
-rw-r--r--fs/nfs/nfs4session.h13
-rw-r--r--fs/nfs/nfs4state.c15
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfsd/nfs4recover.c20
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/nilfs2/dir.c48
-rw-r--r--fs/nilfs2/inode.c27
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ntfs/dir.c84
-rw-r--r--fs/ocfs2/aops.c5
-rw-r--r--fs/ocfs2/dir.c151
-rw-r--r--fs/ocfs2/dir.h5
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/extent_map.c2
-rw-r--r--fs/ocfs2/file.c6
-rw-r--r--fs/ocfs2/journal.c14
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/omfs/dir.c94
-rw-r--r--fs/openpromfs/inode.c95
-rw-r--r--fs/pnode.c3
-rw-r--r--fs/proc/base.c364
-rw-r--r--fs/proc/fd.c96
-rw-r--r--fs/proc/generic.c100
-rw-r--r--fs/proc/internal.h8
-rw-r--r--fs/proc/kmsg.c10
-rw-r--r--fs/proc/namespaces.c74
-rw-r--r--fs/proc/proc_net.c9
-rw-r--r--fs/proc/proc_sysctl.c71
-rw-r--r--fs/proc/root.c19
-rw-r--r--fs/qnx4/dir.c66
-rw-r--r--fs/qnx6/dir.c31
-rw-r--r--fs/read_write.c24
-rw-r--r--fs/readdir.c56
-rw-r--r--fs/reiserfs/dir.c38
-rw-r--r--fs/reiserfs/inode.c21
-rw-r--r--fs/reiserfs/reiserfs.h2
-rw-r--r--fs/reiserfs/xattr.c47
-rw-r--r--fs/reiserfs/xattr_acl.c3
-rw-r--r--fs/romfs/super.c21
-rw-r--r--fs/splice.c32
-rw-r--r--fs/squashfs/dir.c40
-rw-r--r--fs/sysfs/dir.c68
-rw-r--r--fs/sysfs/file.c10
-rw-r--r--fs/sysfs/inode.c2
-rw-r--r--fs/sysv/dir.c37
-rw-r--r--fs/ubifs/dir.c69
-rw-r--r--fs/ubifs/file.c5
-rw-r--r--fs/udf/dir.c63
-rw-r--r--fs/ufs/dir.c28
-rw-r--r--fs/xfs/xfs_acl.c31
-rw-r--r--fs/xfs/xfs_acl.h31
-rw-r--r--fs/xfs/xfs_aops.c33
-rw-r--r--fs/xfs/xfs_attr_leaf.c98
-rw-r--r--fs/xfs/xfs_attr_leaf.h1
-rw-r--r--fs/xfs/xfs_attr_remote.c408
-rw-r--r--fs/xfs/xfs_attr_remote.h10
-rw-r--r--fs/xfs/xfs_btree.c10
-rw-r--r--fs/xfs/xfs_buf.c3
-rw-r--r--fs/xfs/xfs_buf_item.c7
-rw-r--r--fs/xfs/xfs_da_btree.c7
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dir2.c13
-rw-r--r--fs/xfs/xfs_dir2_block.c17
-rw-r--r--fs/xfs/xfs_dir2_format.h4
-rw-r--r--fs/xfs/xfs_dir2_leaf.c20
-rw-r--r--fs/xfs/xfs_dir2_node.c13
-rw-r--r--fs/xfs/xfs_dir2_priv.h11
-rw-r--r--fs/xfs/xfs_dir2_sf.c31
-rw-r--r--fs/xfs/xfs_dquot.c37
-rw-r--r--fs/xfs/xfs_extfree_item.c5
-rw-r--r--fs/xfs/xfs_file.c12
-rw-r--r--fs/xfs/xfs_fs.h1
-rw-r--r--fs/xfs/xfs_fsops.c4
-rw-r--r--fs/xfs/xfs_inode.c16
-rw-r--r--fs/xfs/xfs_iops.c47
-rw-r--r--fs/xfs/xfs_log_cil.c2
-rw-r--r--fs/xfs/xfs_log_recover.c114
-rw-r--r--fs/xfs/xfs_mount.c18
-rw-r--r--fs/xfs/xfs_qm.c40
-rw-r--r--fs/xfs/xfs_qm_syscalls.c40
-rw-r--r--fs/xfs/xfs_quota.h2
-rw-r--r--fs/xfs/xfs_super.c11
-rw-r--r--fs/xfs/xfs_symlink.c20
-rw-r--r--fs/xfs/xfs_trace.h15
-rw-r--r--fs/xfs/xfs_vnodeops.c4
-rw-r--r--fs/xfs/xfs_vnodeops.h3
273 files changed, 7549 insertions, 6813 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055562c580b4..9ff073f4090a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
* @offset: offset in the page
*/
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* If called with zero offset, we should release
* the private state assocated with the page
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
v9fs_fscache_invalidate_page(page);
}
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index be1e34adc3c6..4d0c2e0be7e5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -101,16 +101,15 @@ static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
}
/**
- * v9fs_dir_readdir - read a directory
- * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * v9fs_dir_readdir - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
*
*/
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
{
- int over;
+ bool over;
struct p9_wstat st;
int err = 0;
struct p9_fid *fid;
@@ -118,19 +117,19 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
int reclen = 0;
struct p9_rdir *rdir;
- p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
- fid = filp->private_data;
+ p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ fid = file->private_data;
buflen = fid->clnt->msize - P9_IOHDRSZ;
- rdir = v9fs_alloc_rdir_buf(filp, buflen);
+ rdir = v9fs_alloc_rdir_buf(file, buflen);
if (!rdir)
return -ENOMEM;
while (1) {
if (rdir->tail == rdir->head) {
- err = v9fs_file_readn(filp, rdir->buf, NULL,
- buflen, filp->f_pos);
+ err = v9fs_file_readn(file, rdir->buf, NULL,
+ buflen, ctx->pos);
if (err <= 0)
return err;
@@ -148,51 +147,45 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
reclen = st.size+2;
- over = filldir(dirent, st.name, strlen(st.name),
- filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
-
+ over = !dir_emit(ctx, st.name, strlen(st.name),
+ v9fs_qid2ino(&st.qid), dt_type(&st));
p9stat_free(&st);
-
if (over)
return 0;
rdir->head += reclen;
- filp->f_pos += reclen;
+ ctx->pos += reclen;
}
}
}
/**
- * v9fs_dir_readdir_dotl - read a directory
- * @filp: opened file structure
- * @dirent: buffer to fill dirent structures
- * @filldir: function to populate dirent structures
+ * v9fs_dir_readdir_dotl - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
*
*/
-static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
- filldir_t filldir)
+static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
{
- int over;
int err = 0;
struct p9_fid *fid;
int buflen;
struct p9_rdir *rdir;
struct p9_dirent curdirent;
- u64 oldoffset = 0;
- p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
- fid = filp->private_data;
+ p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ fid = file->private_data;
buflen = fid->clnt->msize - P9_READDIRHDRSZ;
- rdir = v9fs_alloc_rdir_buf(filp, buflen);
+ rdir = v9fs_alloc_rdir_buf(file, buflen);
if (!rdir)
return -ENOMEM;
while (1) {
if (rdir->tail == rdir->head) {
err = p9_client_readdir(fid, rdir->buf, buflen,
- filp->f_pos);
+ ctx->pos);
if (err <= 0)
return err;
@@ -210,22 +203,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
return -EIO;
}
- /* d_off in dirent structure tracks the offset into
- * the next dirent in the dir. However, filldir()
- * expects offset into the current dirent. Hence
- * while calling filldir send the offset from the
- * previous dirent structure.
- */
- over = filldir(dirent, curdirent.d_name,
- strlen(curdirent.d_name),
- oldoffset, v9fs_qid2ino(&curdirent.qid),
- curdirent.d_type);
- oldoffset = curdirent.d_off;
-
- if (over)
+ if (!dir_emit(ctx, curdirent.d_name,
+ strlen(curdirent.d_name),
+ v9fs_qid2ino(&curdirent.qid),
+ curdirent.d_type))
return 0;
- filp->f_pos = curdirent.d_off;
+ ctx->pos = curdirent.d_off;
rdir->head += err;
}
}
@@ -254,7 +238,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
const struct file_operations v9fs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = v9fs_dir_readdir,
+ .iterate = v9fs_dir_readdir,
.open = v9fs_file_open,
.release = v9fs_dir_release,
};
@@ -262,7 +246,7 @@ const struct file_operations v9fs_dir_operations = {
const struct file_operations v9fs_dir_operations_dotl = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = v9fs_dir_readdir_dotl,
+ .iterate = v9fs_dir_readdir_dotl,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 9cf874ce8336..ade28bb058e3 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -17,47 +17,43 @@
static DEFINE_RWLOCK(adfs_dir_lock);
static int
-adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+adfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct object_info obj;
struct adfs_dir dir;
int ret = 0;
- if (filp->f_pos >> 32)
- goto out;
+ if (ctx->pos >> 32)
+ return 0;
ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
if (ret)
- goto out;
+ return ret;
- switch ((unsigned long)filp->f_pos) {
- case 0:
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
goto free_out;
- filp->f_pos += 1;
-
- case 1:
- if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0)
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
+ if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
goto free_out;
- filp->f_pos += 1;
-
- default:
- break;
+ ctx->pos = 2;
}
read_lock(&adfs_dir_lock);
- ret = ops->setpos(&dir, filp->f_pos - 2);
+ ret = ops->setpos(&dir, ctx->pos - 2);
if (ret)
goto unlock_out;
while (ops->getnext(&dir, &obj) == 0) {
- if (filldir(dirent, obj.name, obj.name_len,
- filp->f_pos, obj.file_id, DT_UNKNOWN) < 0)
- goto unlock_out;
- filp->f_pos += 1;
+ if (!dir_emit(ctx, obj.name, obj.name_len,
+ obj.file_id, DT_UNKNOWN))
+ break;
+ ctx->pos++;
}
unlock_out:
@@ -65,8 +61,6 @@ unlock_out:
free_out:
ops->free(&dir);
-
-out:
return ret;
}
@@ -192,7 +186,7 @@ out:
const struct file_operations adfs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = adfs_readdir,
+ .iterate = adfs_readdir,
.fsync = generic_file_fsync,
};
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index fd11a6d608ee..f1eba8c3644e 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -15,12 +15,12 @@
#include "affs.h"
-static int affs_readdir(struct file *, void *, filldir_t);
+static int affs_readdir(struct file *, struct dir_context *);
const struct file_operations affs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = affs_readdir,
+ .iterate = affs_readdir,
.fsync = affs_file_fsync,
};
@@ -40,52 +40,35 @@ const struct inode_operations affs_dir_inode_operations = {
};
static int
-affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+affs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- struct buffer_head *dir_bh;
- struct buffer_head *fh_bh;
+ struct buffer_head *dir_bh = NULL;
+ struct buffer_head *fh_bh = NULL;
unsigned char *name;
int namelen;
u32 i;
int hash_pos;
int chain_pos;
- u32 f_pos;
u32 ino;
- int stored;
- int res;
- pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos);
+ pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
- stored = 0;
- res = -EIO;
- dir_bh = NULL;
- fh_bh = NULL;
- f_pos = filp->f_pos;
-
- if (f_pos == 0) {
- filp->private_data = (void *)0;
- if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0)
+ if (ctx->pos < 2) {
+ file->private_data = (void *)0;
+ if (!dir_emit_dots(file, ctx))
return 0;
- filp->f_pos = f_pos = 1;
- stored++;
- }
- if (f_pos == 1) {
- if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0)
- return stored;
- filp->f_pos = f_pos = 2;
- stored++;
}
affs_lock_dir(inode);
- chain_pos = (f_pos - 2) & 0xffff;
- hash_pos = (f_pos - 2) >> 16;
+ chain_pos = (ctx->pos - 2) & 0xffff;
+ hash_pos = (ctx->pos - 2) >> 16;
if (chain_pos == 0xffff) {
affs_warning(sb, "readdir", "More than 65535 entries in chain");
chain_pos = 0;
hash_pos++;
- filp->f_pos = ((hash_pos << 16) | chain_pos) + 2;
+ ctx->pos = ((hash_pos << 16) | chain_pos) + 2;
}
dir_bh = affs_bread(sb, inode->i_ino);
if (!dir_bh)
@@ -94,8 +77,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* If the directory hasn't changed since the last call to readdir(),
* we can jump directly to where we left off.
*/
- ino = (u32)(long)filp->private_data;
- if (ino && filp->f_version == inode->i_version) {
+ ino = (u32)(long)file->private_data;
+ if (ino && file->f_version == inode->i_version) {
pr_debug("AFFS: readdir() left off=%d\n", ino);
goto inside;
}
@@ -105,7 +88,7 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
affs_error(sb, "readdir","Cannot read block %d", i);
- goto readdir_out;
+ return -EIO;
}
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
@@ -119,38 +102,34 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
if (!ino)
continue;
- f_pos = (hash_pos << 16) + 2;
+ ctx->pos = (hash_pos << 16) + 2;
inside:
do {
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
affs_error(sb, "readdir","Cannot read block %d", ino);
- goto readdir_done;
+ break;
}
namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
name = AFFS_TAIL(sb, fh_bh)->name + 1;
pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
- namelen, name, ino, hash_pos, f_pos);
- if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0)
+ namelen, name, ino, hash_pos, (u32)ctx->pos);
+ if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
goto readdir_done;
- stored++;
- f_pos++;
+ ctx->pos++;
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
fh_bh = NULL;
} while (ino);
}
readdir_done:
- filp->f_pos = f_pos;
- filp->f_version = inode->i_version;
- filp->private_data = (void *)(long)ino;
- res = stored;
+ file->f_version = inode->i_version;
+ file->private_data = (void *)(long)ino;
readdir_out:
affs_brelse(dir_bh);
affs_brelse(fh_bh);
affs_unlock_dir(inode);
- pr_debug("AFFS: readdir()=%d\n", stored);
- return res;
+ return 0;
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 7a465ed04444..34494fbead0a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -22,7 +22,7 @@
static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, struct dir_context *ctx);
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_release(struct dentry *dentry);
@@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
const struct file_operations afs_dir_file_operations = {
.open = afs_dir_open,
.release = afs_release,
- .readdir = afs_readdir,
+ .iterate = afs_readdir,
.lock = afs_lock,
.llseek = generic_file_llseek,
};
@@ -119,9 +119,9 @@ struct afs_dir_page {
};
struct afs_lookup_cookie {
+ struct dir_context ctx;
struct afs_fid fid;
- const char *name;
- size_t nlen;
+ struct qstr name;
int found;
};
@@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file)
/*
* deal with one block in an AFS directory
*/
-static int afs_dir_iterate_block(unsigned *fpos,
+static int afs_dir_iterate_block(struct dir_context *ctx,
union afs_dir_block *block,
- unsigned blkoff,
- void *cookie,
- filldir_t filldir)
+ unsigned blkoff)
{
union afs_dirent *dire;
unsigned offset, next, curr;
size_t nlen;
- int tmp, ret;
+ int tmp;
- _enter("%u,%x,%p,,",*fpos,blkoff,block);
+ _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
- curr = (*fpos - blkoff) / sizeof(union afs_dirent);
+ curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);
/* walk through the block, an entry at a time */
for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
@@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
_debug("ENT[%Zu.%u]: unused",
blkoff / sizeof(union afs_dir_block), offset);
if (offset >= curr)
- *fpos = blkoff +
+ ctx->pos = blkoff +
next * sizeof(union afs_dirent);
continue;
}
@@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
continue;
/* found the next entry */
- ret = filldir(cookie,
- dire->u.name,
- nlen,
- blkoff + offset * sizeof(union afs_dirent),
+ if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
- filldir == afs_lookup_filldir ?
- ntohl(dire->u.unique) : DT_UNKNOWN);
- if (ret < 0) {
+ ctx->actor == afs_lookup_filldir ?
+ ntohl(dire->u.unique) : DT_UNKNOWN)) {
_leave(" = 0 [full]");
return 0;
}
- *fpos = blkoff + next * sizeof(union afs_dirent);
+ ctx->pos = blkoff + next * sizeof(union afs_dirent);
}
_leave(" = 1 [more]");
@@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos,
/*
* iterate through the data blob that lists the contents of an AFS directory
*/
-static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
- filldir_t filldir, struct key *key)
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
+ struct key *key)
{
union afs_dir_block *dblock;
struct afs_dir_page *dbuf;
@@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
unsigned blkoff, limit;
int ret;
- _enter("{%lu},%u,,", dir->i_ino, *fpos);
+ _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
_leave(" = -ESTALE");
@@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
}
/* round the file position up to the next entry boundary */
- *fpos += sizeof(union afs_dirent) - 1;
- *fpos &= ~(sizeof(union afs_dirent) - 1);
+ ctx->pos += sizeof(union afs_dirent) - 1;
+ ctx->pos &= ~(sizeof(union afs_dirent) - 1);
/* walk through the blocks in sequence */
ret = 0;
- while (*fpos < dir->i_size) {
- blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
+ while (ctx->pos < dir->i_size) {
+ blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);
/* fetch the appropriate page from the directory */
page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
@@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
do {
dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
sizeof(union afs_dir_block)];
- ret = afs_dir_iterate_block(fpos, dblock, blkoff,
- cookie, filldir);
+ ret = afs_dir_iterate_block(ctx, dblock, blkoff);
if (ret != 1) {
afs_dir_put_page(page);
goto out;
@@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
blkoff += sizeof(union afs_dir_block);
- } while (*fpos < dir->i_size && blkoff < limit);
+ } while (ctx->pos < dir->i_size && blkoff < limit);
afs_dir_put_page(page);
ret = 0;
@@ -387,23 +380,10 @@ out:
/*
* read an AFS directory
*/
-static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned fpos;
- int ret;
-
- _enter("{%Ld,{%lu}}",
- file->f_pos, file_inode(file)->i_ino);
-
- ASSERT(file->private_data != NULL);
-
- fpos = file->f_pos;
- ret = afs_dir_iterate(file_inode(file), &fpos,
- cookie, filldir, file->private_data);
- file->f_pos = fpos;
-
- _leave(" = %d", ret);
- return ret;
+ return afs_dir_iterate(file_inode(file),
+ ctx, file->private_data);
}
/*
@@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
{
struct afs_lookup_cookie *cookie = _cookie;
- _enter("{%s,%Zu},%s,%u,,%llu,%u",
- cookie->name, cookie->nlen, name, nlen,
+ _enter("{%s,%u},%s,%u,,%llu,%u",
+ cookie->name.name, cookie->name.len, name, nlen,
(unsigned long long) ino, dtype);
/* insanity checks first */
BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
- if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
+ if (cookie->name.len != nlen ||
+ memcmp(cookie->name.name, name, nlen) != 0) {
_leave(" = 0 [no]");
return 0;
}
@@ -444,24 +425,18 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
struct afs_fid *fid, struct key *key)
{
- struct afs_lookup_cookie cookie;
- struct afs_super_info *as;
- unsigned fpos;
+ struct afs_super_info *as = dir->i_sb->s_fs_info;
+ struct afs_lookup_cookie cookie = {
+ .ctx.actor = afs_lookup_filldir,
+ .name = dentry->d_name,
+ .fid.vid = as->volume->vid
+ };
int ret;
_enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
- as = dir->i_sb->s_fs_info;
-
/* search the directory */
- cookie.name = dentry->d_name.name;
- cookie.nlen = dentry->d_name.len;
- cookie.fid.vid = as->volume->vid;
- cookie.found = 0;
-
- fpos = 0;
- ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
- key);
+ ret = afs_dir_iterate(dir, &cookie.ctx, key);
if (ret < 0) {
_leave(" = %d [iter]", ret);
return ret;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d565..66d50fe2ee45 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
#include "internal.h"
static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
static int afs_launder_page(struct page *page);
@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
- _enter("{%lu},%lu", page->index, offset);
+ _enter("{%lu},%u,%u", page->index, offset, length);
BUG_ON(!PageLocked(page));
/* we clean up only if the entire page is being invalidated */
- if (offset == 0) {
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/aio.c b/fs/aio.c
index c5b1a8c10411..2bbcacf74d0c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,9 +141,6 @@ static void aio_free_ring(struct kioctx *ctx)
for (i = 0; i < ctx->nr_pages; i++)
put_page(ctx->ring_pages[i]);
- if (ctx->mmap_size)
- vm_munmap(ctx->mmap_base, ctx->mmap_size);
-
if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
kfree(ctx->ring_pages);
}
@@ -307,7 +304,9 @@ static void free_ioctx(struct kioctx *ctx)
kunmap_atomic(ring);
while (atomic_read(&ctx->reqs_active) > 0) {
- wait_event(ctx->wait, head != ctx->tail);
+ wait_event(ctx->wait,
+ head != ctx->tail ||
+ atomic_read(&ctx->reqs_active) <= 0);
avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
@@ -320,11 +319,6 @@ static void free_ioctx(struct kioctx *ctx)
aio_free_ring(ctx);
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
- aio_nr -= ctx->max_reqs;
- spin_unlock(&aio_nr_lock);
-
pr_debug("freeing %p\n", ctx);
/*
@@ -433,17 +427,24 @@ static void kill_ioctx(struct kioctx *ctx)
{
if (!atomic_xchg(&ctx->dead, 1)) {
hlist_del_rcu(&ctx->list);
- /* Between hlist_del_rcu() and dropping the initial ref */
- synchronize_rcu();
/*
- * We can't punt to workqueue here because put_ioctx() ->
- * free_ioctx() will unmap the ringbuffer, and that has to be
- * done in the original process's context. kill_ioctx_rcu/work()
- * exist for exit_aio(), as in that path free_ioctx() won't do
- * the unmap.
+ * It'd be more correct to do this in free_ioctx(), after all
+ * the outstanding kiocbs have finished - but by then io_destroy
+ * has already returned, so io_setup() could potentially return
+ * -EAGAIN with no ioctxs actually in use (as far as userspace
+ * could tell).
*/
- kill_ioctx_work(&ctx->rcu_work);
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
+ aio_nr -= ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
+
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
+
+ /* Between hlist_del_rcu() and dropping the initial ref */
+ call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
}
}
@@ -493,10 +494,7 @@ void exit_aio(struct mm_struct *mm)
*/
ctx->mmap_size = 0;
- if (!atomic_xchg(&ctx->dead, 1)) {
- hlist_del_rcu(&ctx->list);
- call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
- }
+ kill_ioctx(ctx);
}
}
@@ -1299,8 +1297,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
* < min_nr if the timeout specified by timeout has elapsed
* before sufficient events are available, where timeout == NULL
* specifies an infinite timeout. Note that the timeout pointed to by
- * timeout is relative and will be updated if not NULL and the
- * operation blocks. Will fail with -ENOSYS if not implemented.
+ * timeout is relative. Will fail with -ENOSYS if not implemented.
*/
SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
long, min_nr,
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 085da86e07c2..ca8e55548d98 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = {
.open = dcache_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
- .readdir = dcache_readdir,
+ .iterate = dcache_readdir,
.llseek = dcache_dir_lseek,
.unlocked_ioctl = autofs4_root_ioctl,
#ifdef CONFIG_COMPAT
@@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = {
.open = autofs4_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
- .readdir = dcache_readdir,
+ .iterate = dcache_readdir,
.llseek = dcache_dir_lseek,
};
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 922ad460bff9..7c93953030fb 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return -EIO;
}
-static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int bad_file_readdir(struct file *file, struct dir_context *ctx)
{
return -EIO;
}
@@ -152,7 +152,7 @@ static const struct file_operations bad_file_ops =
.write = bad_file_write,
.aio_read = bad_file_aio_read,
.aio_write = bad_file_aio_write,
- .readdir = bad_file_readdir,
+ .iterate = bad_file_readdir,
.poll = bad_file_poll,
.unlocked_ioctl = bad_file_unlocked_ioctl,
.compat_ioctl = bad_file_compat_ioctl,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 8615ee89ab55..e9c75e20db32 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -31,7 +31,7 @@ MODULE_LICENSE("GPL");
/* The units the vfs expects inode->i_blocks to be in */
#define VFS_BLOCK_SIZE 512
-static int befs_readdir(struct file *, void *, filldir_t);
+static int befs_readdir(struct file *, struct dir_context *);
static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
static int befs_readpage(struct file *file, struct page *page);
static sector_t befs_bmap(struct address_space *mapping, sector_t block);
@@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
static const struct file_operations befs_dir_operations = {
.read = generic_read_dir,
- .readdir = befs_readdir,
+ .iterate = befs_readdir,
.llseek = generic_file_llseek,
};
@@ -211,9 +211,9 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
}
static int
-befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+befs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
befs_off_t value;
@@ -221,15 +221,14 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
size_t keysize;
unsigned char d_type;
char keybuf[BEFS_NAME_LEN + 1];
- char *nlsname;
- int nlsnamelen;
- const char *dirname = filp->f_path.dentry->d_name.name;
+ const char *dirname = file->f_path.dentry->d_name.name;
befs_debug(sb, "---> befs_readdir() "
- "name %s, inode %ld, filp->f_pos %Ld",
- dirname, inode->i_ino, filp->f_pos);
+ "name %s, inode %ld, ctx->pos %Ld",
+ dirname, inode->i_ino, ctx->pos);
- result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1,
+more:
+ result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
keybuf, &keysize, &value);
if (result == BEFS_ERR) {
@@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* Convert to NLS */
if (BEFS_SB(sb)->nls) {
+ char *nlsname;
+ int nlsnamelen;
result =
befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
if (result < 0) {
befs_debug(sb, "<--- befs_readdir() ERROR");
return result;
}
- result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos,
- (ino_t) value, d_type);
+ if (!dir_emit(ctx, nlsname, nlsnamelen,
+ (ino_t) value, d_type)) {
+ kfree(nlsname);
+ return 0;
+ }
kfree(nlsname);
-
} else {
- result = filldir(dirent, keybuf, keysize, filp->f_pos,
- (ino_t) value, d_type);
+ if (!dir_emit(ctx, keybuf, keysize,
+ (ino_t) value, d_type))
+ return 0;
}
+ ctx->pos++;
+ goto more;
- filp->f_pos++;
-
- befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos);
+ befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos);
return 0;
}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 3f422f6bb5ca..a399e6d9dc74 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -26,58 +26,51 @@ static struct buffer_head *bfs_find_entry(struct inode *dir,
const unsigned char *name, int namelen,
struct bfs_dirent **res_dir);
-static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, struct dir_context *ctx)
{
struct inode *dir = file_inode(f);
struct buffer_head *bh;
struct bfs_dirent *de;
- struct bfs_sb_info *info = BFS_SB(dir->i_sb);
unsigned int offset;
int block;
- mutex_lock(&info->bfs_lock);
-
- if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
+ if (ctx->pos & (BFS_DIRENT_SIZE - 1)) {
printf("Bad f_pos=%08lx for %s:%08lx\n",
- (unsigned long)f->f_pos,
+ (unsigned long)ctx->pos,
dir->i_sb->s_id, dir->i_ino);
- mutex_unlock(&info->bfs_lock);
- return -EBADF;
+ return -EINVAL;
}
- while (f->f_pos < dir->i_size) {
- offset = f->f_pos & (BFS_BSIZE - 1);
- block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
+ while (ctx->pos < dir->i_size) {
+ offset = ctx->pos & (BFS_BSIZE - 1);
+ block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS);
bh = sb_bread(dir->i_sb, block);
if (!bh) {
- f->f_pos += BFS_BSIZE - offset;
+ ctx->pos += BFS_BSIZE - offset;
continue;
}
do {
de = (struct bfs_dirent *)(bh->b_data + offset);
if (de->ino) {
int size = strnlen(de->name, BFS_NAMELEN);
- if (filldir(dirent, de->name, size, f->f_pos,
+ if (!dir_emit(ctx, de->name, size,
le16_to_cpu(de->ino),
- DT_UNKNOWN) < 0) {
+ DT_UNKNOWN)) {
brelse(bh);
- mutex_unlock(&info->bfs_lock);
return 0;
}
}
offset += BFS_DIRENT_SIZE;
- f->f_pos += BFS_DIRENT_SIZE;
- } while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
+ ctx->pos += BFS_DIRENT_SIZE;
+ } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size));
brelse(bh);
}
-
- mutex_unlock(&info->bfs_lock);
- return 0;
+ return 0;
}
const struct file_operations bfs_dir_operations = {
.read = generic_read_dir,
- .readdir = bfs_readdir,
+ .iterate = bfs_readdir,
.fsync = generic_file_fsync,
.llseek = generic_file_llseek,
};
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b4fb41558111..290e347b6db3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -918,7 +918,8 @@ again:
ref->parent, bsz, 0);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 18af6f48781a..1431a6965017 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1700,7 +1700,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
unsigned int j;
DECLARE_COMPLETION_ONSTACK(complete);
- bio = bio_alloc(GFP_NOFS, num_pages - i);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
if (!bio) {
printk(KERN_INFO
"btrfsic: bio_alloc() for %u pages failed!\n",
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index de6de8e60b46..02fae7f7e42c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -951,10 +951,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
}
if (new_flags != 0) {
+ int level = btrfs_header_level(buf);
+
ret = btrfs_set_disk_extent_flags(trans, root,
buf->start,
buf->len,
- new_flags, 0);
+ new_flags, level, 0);
if (ret)
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 63c328a9ce95..d6dd49b51ba8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -88,12 +88,12 @@ struct btrfs_ordered_sum;
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
-/* for storing balance parameters in the root tree */
-#define BTRFS_BALANCE_OBJECTID -4ULL
-
/* holds quota configuration and tracking */
#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -3075,7 +3075,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
- int is_data);
+ int level, int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f26f38ccd194..eb34438ddedb 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1681,8 +1681,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
* btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
*
*/
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
- filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list)
{
struct btrfs_dir_item *di;
@@ -1704,13 +1703,13 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (curr->key.offset < filp->f_pos) {
+ if (curr->key.offset < ctx->pos) {
if (atomic_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
- filp->f_pos = curr->key.offset;
+ ctx->pos = curr->key.offset;
di = (struct btrfs_dir_item *)curr->data;
name = (char *)(di + 1);
@@ -1719,7 +1718,7 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
d_type = btrfs_filetype_table[di->type];
btrfs_disk_key_to_cpu(&location, &di->location);
- over = filldir(dirent, name, name_len, curr->key.offset,
+ over = !dir_emit(ctx, name, name_len,
location.objectid, d_type);
if (atomic_dec_and_test(&curr->refs))
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1d5c5f7abe3e..a4b38f934d14 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -139,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
struct list_head *del_list);
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index);
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
- filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list);
/* for init */
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f75fcaf79aeb..70b962cc177d 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -60,6 +60,7 @@ struct btrfs_delayed_ref_node {
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
u64 flags_to_set;
+ int level;
unsigned int update_key:1;
unsigned int update_flags:1;
unsigned int is_data:1;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7ba7b3900cb8..65241f32d3f8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -313,6 +313,11 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
+ if (btrfs_fs_incompat(fs_info, RAID56)) {
+ pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n");
+ return -EINVAL;
+ }
+
switch (args->start.cont_reading_from_srcdev_mode) {
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4e9ebe1f1827..b0292b3ead54 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -152,7 +152,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
{ .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
{ .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
- { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
{ .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
{ .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
@@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
return try_release_extent_buffer(page);
}
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1513,7 +1514,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
}
root->commit_root = btrfs_root_node(root);
- BUG_ON(!root->node); /* -ENOMEM */
out:
if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
@@ -1988,30 +1988,33 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
{
free_extent_buffer(info->tree_root->node);
free_extent_buffer(info->tree_root->commit_root);
- free_extent_buffer(info->dev_root->node);
- free_extent_buffer(info->dev_root->commit_root);
- free_extent_buffer(info->extent_root->node);
- free_extent_buffer(info->extent_root->commit_root);
- free_extent_buffer(info->csum_root->node);
- free_extent_buffer(info->csum_root->commit_root);
- if (info->quota_root) {
- free_extent_buffer(info->quota_root->node);
- free_extent_buffer(info->quota_root->commit_root);
- }
-
info->tree_root->node = NULL;
info->tree_root->commit_root = NULL;
- info->dev_root->node = NULL;
- info->dev_root->commit_root = NULL;
- info->extent_root->node = NULL;
- info->extent_root->commit_root = NULL;
- info->csum_root->node = NULL;
- info->csum_root->commit_root = NULL;
+
+ if (info->dev_root) {
+ free_extent_buffer(info->dev_root->node);
+ free_extent_buffer(info->dev_root->commit_root);
+ info->dev_root->node = NULL;
+ info->dev_root->commit_root = NULL;
+ }
+ if (info->extent_root) {
+ free_extent_buffer(info->extent_root->node);
+ free_extent_buffer(info->extent_root->commit_root);
+ info->extent_root->node = NULL;
+ info->extent_root->commit_root = NULL;
+ }
+ if (info->csum_root) {
+ free_extent_buffer(info->csum_root->node);
+ free_extent_buffer(info->csum_root->commit_root);
+ info->csum_root->node = NULL;
+ info->csum_root->commit_root = NULL;
+ }
if (info->quota_root) {
+ free_extent_buffer(info->quota_root->node);
+ free_extent_buffer(info->quota_root->commit_root);
info->quota_root->node = NULL;
info->quota_root->commit_root = NULL;
}
-
if (chunk_root) {
free_extent_buffer(info->chunk_root->node);
free_extent_buffer(info->chunk_root->commit_root);
@@ -2857,8 +2860,8 @@ fail_qgroup:
btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
- del_fs_roots(fs_info);
btrfs_cleanup_transaction(fs_info->tree_root);
+ del_fs_roots(fs_info);
fail_cleaner:
kthread_stop(fs_info->cleaner_kthread);
@@ -3128,7 +3131,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
* caller
*/
device->flush_bio = NULL;
- bio = bio_alloc(GFP_NOFS, 0);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
if (!bio)
return -ENOMEM;
@@ -3510,15 +3513,15 @@ int close_ctree(struct btrfs_root *root)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- free_root_pointers(fs_info, 1);
-
btrfs_free_block_groups(fs_info);
+ btrfs_stop_all_workers(fs_info);
+
del_fs_roots(fs_info);
- iput(fs_info->btree_inode);
+ free_root_pointers(fs_info, 1);
- btrfs_stop_all_workers(fs_info);
+ iput(fs_info->btree_inode);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3659,8 +3662,11 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
ordered_operations);
list_del_init(&btrfs_inode->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_extent_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
@@ -3782,8 +3788,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
list_del_init(&btrfs_inode->delalloc_inodes);
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&btrfs_inode->runtime_flags);
+ spin_unlock(&root->fs_info->delalloc_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
+
+ spin_lock(&root->fs_info->delalloc_lock);
}
spin_unlock(&root->fs_info->delalloc_lock);
@@ -3808,7 +3817,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
while (start <= end) {
eb = btrfs_find_tree_block(root, start,
root->leafsize);
- start += eb->len;
+ start += root->leafsize;
if (!eb)
continue;
wait_on_extent_buffer_writeback(eb);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2305b5c5cf00..df472ab1b5ac 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2070,8 +2070,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
u32 item_size;
int ret;
int err = 0;
- int metadata = (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
- node->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ int metadata = !extent_op->is_data;
if (trans->aborted)
return 0;
@@ -2086,11 +2085,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
key.objectid = node->bytenr;
if (metadata) {
- struct btrfs_delayed_tree_ref *tree_ref;
-
- tree_ref = btrfs_delayed_node_to_tree_ref(node);
key.type = BTRFS_METADATA_ITEM_KEY;
- key.offset = tree_ref->level;
+ key.offset = extent_op->level;
} else {
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = node->num_bytes;
@@ -2719,7 +2715,7 @@ out:
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
- int is_data)
+ int level, int is_data)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
@@ -2732,6 +2728,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->update_flags = 1;
extent_op->update_key = 0;
extent_op->is_data = is_data ? 1 : 0;
+ extent_op->level = level;
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
num_bytes, extent_op);
@@ -3109,6 +3106,11 @@ again:
WARN_ON(ret);
if (i_size_read(inode) > 0) {
+ ret = btrfs_check_trunc_cache_free_space(root,
+ &root->fs_info->global_block_rsv);
+ if (ret)
+ goto out_put;
+
ret = btrfs_truncate_free_space_cache(root, trans, path,
inode);
if (ret)
@@ -4562,6 +4564,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+ if (fs_info->quota_root)
+ fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
update_global_block_rsv(fs_info);
@@ -6651,51 +6655,51 @@ use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
int ret;
+ bool global_updated = false;
block_rsv = get_block_rsv(trans, root);
- if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(root, block_rsv, blocksize,
- BTRFS_RESERVE_NO_FLUSH);
- /*
- * If we couldn't reserve metadata bytes try and use some from
- * the global reserve.
- */
- if (ret && block_rsv != global_rsv) {
- ret = block_rsv_use_bytes(global_rsv, blocksize);
- if (!ret)
- return global_rsv;
- return ERR_PTR(ret);
- } else if (ret) {
- return ERR_PTR(ret);
- }
+ if (unlikely(block_rsv->size == 0))
+ goto try_reserve;
+again:
+ ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (!ret)
return block_rsv;
+
+ if (block_rsv->failfast)
+ return ERR_PTR(ret);
+
+ if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+ global_updated = true;
+ update_global_block_rsv(root->fs_info);
+ goto again;
}
- ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL * 10,
+ /*DEFAULT_RATELIMIT_BURST*/ 1);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG
+ "btrfs: block rsv returned %d\n", ret);
+ }
+try_reserve:
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
- if (ret && !block_rsv->failfast) {
- if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL * 10,
- /*DEFAULT_RATELIMIT_BURST*/ 1);
- if (__ratelimit(&_rs))
- WARN(1, KERN_DEBUG
- "btrfs: block rsv returned %d\n", ret);
- }
- ret = reserve_metadata_bytes(root, block_rsv, blocksize,
- BTRFS_RESERVE_NO_FLUSH);
- if (!ret) {
- return block_rsv;
- } else if (ret && block_rsv != global_rsv) {
- ret = block_rsv_use_bytes(global_rsv, blocksize);
- if (!ret)
- return global_rsv;
- }
+ /*
+ * If we couldn't reserve metadata bytes try and use some from
+ * the global reserve if its space type is the same as the global
+ * reservation.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+ block_rsv->space_info == global_rsv->space_info) {
+ ret = block_rsv_use_bytes(global_rsv, blocksize);
+ if (!ret)
+ return global_rsv;
}
-
- return ERR_PTR(-ENOSPC);
+ return ERR_PTR(ret);
}
static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
@@ -6763,6 +6767,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
extent_op->update_key = 1;
extent_op->update_flags = 1;
extent_op->is_data = 0;
+ extent_op->level = level;
ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
ins.objectid,
@@ -6934,7 +6939,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
- eb->len, flag, 0);
+ eb->len, flag,
+ btrfs_header_level(eb), 0);
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 32d67a822e93..6bca9472f313 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -23,6 +23,7 @@
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
+static struct bio_set *btrfs_bioset;
#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
@@ -125,10 +126,20 @@ int __init extent_io_init(void)
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
goto free_state_cache;
+
+ btrfs_bioset = bioset_create(BIO_POOL_SIZE,
+ offsetof(struct btrfs_io_bio, bio));
+ if (!btrfs_bioset)
+ goto free_buffer_cache;
return 0;
+free_buffer_cache:
+ kmem_cache_destroy(extent_buffer_cache);
+ extent_buffer_cache = NULL;
+
free_state_cache:
kmem_cache_destroy(extent_state_cache);
+ extent_state_cache = NULL;
return -ENOMEM;
}
@@ -145,6 +156,8 @@ void extent_io_exit(void)
kmem_cache_destroy(extent_state_cache);
if (extent_buffer_cache)
kmem_cache_destroy(extent_buffer_cache);
+ if (btrfs_bioset)
+ bioset_free(btrfs_bioset);
}
void extent_io_tree_init(struct extent_io_tree *tree,
@@ -1948,28 +1961,6 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
}
/*
- * helper function to unlock a page if all the extents in the tree
- * for that page are unlocked
- */
-static void check_page_locked(struct extent_io_tree *tree, struct page *page)
-{
- u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
- unlock_page(page);
-}
-
-/*
- * helper function to end page writeback if all the extents
- * in the tree for that page are done with writeback
- */
-static void check_page_writeback(struct extent_io_tree *tree,
- struct page *page)
-{
- end_page_writeback(page);
-}
-
-/*
* When IO fails, either with EIO or csum verification fails, we
* try other mirrors that might have a good copy of the data. This
* io_failure_record is used to record state as we go through all the
@@ -2046,7 +2037,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
return 0;
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
bio->bi_private = &compl;
@@ -2336,7 +2327,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
return -EIO;
}
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio) {
free_io_failure(inode, failrec, 0);
return -EIO;
@@ -2398,19 +2389,24 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
struct extent_io_tree *tree;
u64 start;
u64 end;
- int whole_page;
do {
struct page *page = bvec->bv_page;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
+ /* We always issue full-page reads, but if some block
+ * in a page fails to read, blk_update_request() will
+ * advance bv_offset and adjust bv_len to compensate.
+ * Print a warning for nonzero offsets, and an error
+ * if they don't add up to a full page. */
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+ printk("%s page write in btrfs with offset %u and length %u\n",
+ bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+ ? KERN_ERR "partial" : KERN_INFO "incomplete",
+ bvec->bv_offset, bvec->bv_len);
- if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
- whole_page = 1;
- else
- whole_page = 0;
+ start = page_offset(page);
+ end = start + bvec->bv_offset + bvec->bv_len - 1;
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
@@ -2418,10 +2414,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
if (end_extent_writepage(page, err, start, end))
continue;
- if (whole_page)
- end_page_writeback(page);
- else
- check_page_writeback(tree, page);
+ end_page_writeback(page);
} while (bvec >= bio->bi_io_vec);
bio_put(bio);
@@ -2446,7 +2439,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_io_tree *tree;
u64 start;
u64 end;
- int whole_page;
int mirror;
int ret;
@@ -2457,19 +2449,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct page *page = bvec->bv_page;
struct extent_state *cached = NULL;
struct extent_state *state;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
- "mirror=%ld\n", (u64)bio->bi_sector, err,
- (long int)bio->bi_bdev);
+ "mirror=%lu\n", (u64)bio->bi_sector, err,
+ io_bio->mirror_num);
tree = &BTRFS_I(page->mapping->host)->io_tree;
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
+ /* We always issue full-page reads, but if some block
+ * in a page fails to read, blk_update_request() will
+ * advance bv_offset and adjust bv_len to compensate.
+ * Print a warning for nonzero offsets, and an error
+ * if they don't add up to a full page. */
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+ printk("%s page read in btrfs with offset %u and length %u\n",
+ bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+ ? KERN_ERR "partial" : KERN_INFO "incomplete",
+ bvec->bv_offset, bvec->bv_len);
- if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
- whole_page = 1;
- else
- whole_page = 0;
+ start = page_offset(page);
+ end = start + bvec->bv_offset + bvec->bv_len - 1;
if (++bvec <= bvec_end)
prefetchw(&bvec->bv_page->flags);
@@ -2485,7 +2484,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
}
spin_unlock(&tree->lock);
- mirror = (int)(unsigned long)bio->bi_bdev;
+ mirror = io_bio->mirror_num;
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
ret = tree->ops->readpage_end_io_hook(page, start, end,
state, mirror);
@@ -2528,39 +2527,35 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
}
unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
- if (whole_page) {
- if (uptodate) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
+ if (uptodate) {
+ SetPageUptodate(page);
} else {
- if (uptodate) {
- check_page_uptodate(tree, page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- check_page_locked(tree, page);
+ ClearPageUptodate(page);
+ SetPageError(page);
}
+ unlock_page(page);
} while (bvec <= bvec_end);
bio_put(bio);
}
+/*
+ * this allocates from the btrfs_bioset. We're returning a bio right now
+ * but you can call btrfs_io_bio for the appropriate container_of magic
+ */
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags)
{
struct bio *bio;
- bio = bio_alloc(gfp_flags, nr_vecs);
+ bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
+ while (!bio && (nr_vecs /= 2)) {
+ bio = bio_alloc_bioset(gfp_flags,
+ nr_vecs, btrfs_bioset);
+ }
}
if (bio) {
@@ -2571,6 +2566,19 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
return bio;
}
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+ return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+}
+
+
+/* this also allocates from the btrfs_bioset */
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+}
+
+
static int __must_check submit_one_bio(int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
@@ -2949,7 +2957,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
return 0;
}
@@ -3988,7 +3996,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent,
@@ -4075,7 +4083,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
out_free:
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state, GFP_NOFS);
return ret;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a2c03a175009..41fb81e7ec53 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -336,6 +336,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags);
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
struct btrfs_fs_info;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ecca6c7375a6..e53009657f0e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -197,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root,
block_group->key.objectid);
}
-int btrfs_truncate_free_space_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct inode *inode)
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
{
- struct btrfs_block_rsv *rsv;
u64 needed_bytes;
- loff_t oldsize;
- int ret = 0;
-
- rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->global_block_rsv;
+ int ret;
/* 1 for slack space, 1 for updating the inode */
needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
btrfs_calc_trans_metadata_size(root, 1);
- spin_lock(&trans->block_rsv->lock);
- if (trans->block_rsv->reserved < needed_bytes) {
- spin_unlock(&trans->block_rsv->lock);
- trans->block_rsv = rsv;
- return -ENOSPC;
- }
- spin_unlock(&trans->block_rsv->lock);
+ spin_lock(&rsv->lock);
+ if (rsv->reserved < needed_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+ spin_unlock(&rsv->lock);
+ return 0;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct inode *inode)
+{
+ loff_t oldsize;
+ int ret = 0;
oldsize = i_size_read(inode);
btrfs_i_size_write(inode, 0);
@@ -232,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
-
if (ret) {
- trans->block_rsv = rsv;
btrfs_abort_transaction(trans, root, ret);
return ret;
}
@@ -242,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
ret = btrfs_update_inode(trans, root, inode);
if (ret)
btrfs_abort_transaction(trans, root, ret);
- trans->block_rsv = rsv;
return ret;
}
@@ -920,10 +919,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/* Make sure we can fit our crcs into the first page */
if (io_ctl.check_crcs &&
- (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
- WARN_ON(1);
+ (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
goto out_nospc;
- }
io_ctl_set_generation(&io_ctl, trans->transid);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 4dc17d8809c7..8b7f19f44961 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_path *path,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d26f67a59e36..2c66ddbbe670 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
num_bytes = trans->bytes_reserved;
/*
* 1 item for inode item insertion if need
- * 3 items for inode item update (in the worst case)
+ * 4 items for inode item update (in the worst case)
+ * 1 items for slack space if we need do truncation
* 1 item for free space object
* 3 items for pre-allocation
*/
- trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
+ trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
ret = btrfs_block_rsv_add(root, trans->block_rsv,
trans->bytes_reserved,
BTRFS_RESERVE_NO_FLUSH);
@@ -468,7 +469,8 @@ again:
if (i_size_read(inode) > 0) {
ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ if (ret != -ENOSPC)
+ btrfs_abort_transaction(trans, root, ret);
goto out_put;
}
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9b31b3b091fc..4f9d16b70d3d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -715,8 +715,10 @@ retry:
async_extent->ram_size - 1, 0);
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ ret = -ENOMEM;
goto out_free_reserve;
+ }
em->start = async_extent->start;
em->len = async_extent->ram_size;
em->orig_start = em->start;
@@ -923,8 +925,10 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
}
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ ret = -ENOMEM;
goto out_reserve;
+ }
em->start = start;
em->orig_start = em->start;
ram_size = ins.offset;
@@ -4724,6 +4728,7 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
no_delete:
+ btrfs_remove_delayed_node(inode);
clear_inode(inode);
return;
}
@@ -4839,14 +4844,13 @@ static void inode_tree_add(struct inode *inode)
struct rb_node **p;
struct rb_node *parent;
u64 ino = btrfs_ino(inode);
-again:
- p = &root->inode_tree.rb_node;
- parent = NULL;
if (inode_unhashed(inode))
return;
-
+again:
+ parent = NULL;
spin_lock(&root->inode_lock);
+ p = &root->inode_tree.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct btrfs_inode, rb_node);
@@ -5133,10 +5137,9 @@ unsigned char btrfs_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int btrfs_real_readdir(struct file *filp, void *dirent,
- filldir_t filldir)
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_item *item;
struct btrfs_dir_item *di;
@@ -5157,29 +5160,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
char tmp_name[32];
char *name_ptr;
int name_len;
- int is_curr = 0; /* filp->f_pos points to the current index? */
+ int is_curr = 0; /* ctx->pos points to the current index? */
/* FIXME, use a real flag for deciding about the key type */
if (root->fs_info->tree_root == root)
key_type = BTRFS_DIR_ITEM_KEY;
- /* special case for "." */
- if (filp->f_pos == 0) {
- over = filldir(dirent, ".", 1,
- filp->f_pos, btrfs_ino(inode), DT_DIR);
- if (over)
- return 0;
- filp->f_pos = 1;
- }
- /* special case for .., just use the back ref */
- if (filp->f_pos == 1) {
- u64 pino = parent_ino(filp->f_path.dentry);
- over = filldir(dirent, "..", 2,
- filp->f_pos, pino, DT_DIR);
- if (over)
- return 0;
- filp->f_pos = 2;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -5193,7 +5182,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
}
btrfs_set_key_type(&key, key_type);
- key.offset = filp->f_pos;
+ key.offset = ctx->pos;
key.objectid = btrfs_ino(inode);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5219,14 +5208,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
break;
if (btrfs_key_type(&found_key) != key_type)
break;
- if (found_key.offset < filp->f_pos)
+ if (found_key.offset < ctx->pos)
goto next;
if (key_type == BTRFS_DIR_INDEX_KEY &&
btrfs_should_delete_dir_index(&del_list,
found_key.offset))
goto next;
- filp->f_pos = found_key.offset;
+ ctx->pos = found_key.offset;
is_curr = 1;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
@@ -5270,9 +5259,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
over = 0;
goto skip;
}
- over = filldir(dirent, name_ptr, name_len,
- found_key.offset, location.objectid,
- d_type);
+ over = !dir_emit(ctx, name_ptr, name_len,
+ location.objectid, d_type);
skip:
if (name_ptr != tmp_name)
@@ -5291,9 +5279,8 @@ next:
if (key_type == BTRFS_DIR_INDEX_KEY) {
if (is_curr)
- filp->f_pos++;
- ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
- &ins_list);
+ ctx->pos++;
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
if (ret)
goto nopos;
}
@@ -5304,9 +5291,9 @@ next:
* 32-bit glibc will use getdents64, but then strtol -
* so the last number we can serve is this.
*/
- filp->f_pos = 0x7fffffff;
+ ctx->pos = 0x7fffffff;
else
- filp->f_pos++;
+ ctx->pos++;
nopos:
ret = 0;
err:
@@ -6928,7 +6915,11 @@ struct btrfs_dio_private {
/* IO errors */
int errors;
+ /* orig_bio is our btrfs_io_bio */
struct bio *orig_bio;
+
+ /* dio_bio came from fs/direct-io.c */
+ struct bio *dio_bio;
};
static void btrfs_endio_direct_read(struct bio *bio, int err)
@@ -6938,6 +6929,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
struct bio_vec *bvec = bio->bi_io_vec;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct bio *dio_bio;
u64 start;
start = dip->logical_offset;
@@ -6977,14 +6969,15 @@ failed:
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
- bio->bi_private = dip->private;
+ dio_bio = dip->dio_bio;
kfree(dip);
/* If we had a csum failure make sure to clear the uptodate flag */
if (err)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- dio_end_io(bio, err);
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ dio_end_io(dio_bio, err);
+ bio_put(bio);
}
static void btrfs_endio_direct_write(struct bio *bio, int err)
@@ -6995,6 +6988,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
struct btrfs_ordered_extent *ordered = NULL;
u64 ordered_offset = dip->logical_offset;
u64 ordered_bytes = dip->bytes;
+ struct bio *dio_bio;
int ret;
if (err)
@@ -7022,14 +7016,15 @@ out_test:
goto again;
}
out_done:
- bio->bi_private = dip->private;
+ dio_bio = dip->dio_bio;
kfree(dip);
/* If we had an error make sure to clear the uptodate flag */
if (err)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- dio_end_io(bio, err);
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ dio_end_io(dio_bio, err);
+ bio_put(bio);
}
static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
@@ -7065,10 +7060,10 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
if (!atomic_dec_and_test(&dip->pending_bios))
goto out;
- if (dip->errors)
+ if (dip->errors) {
bio_io_error(dip->orig_bio);
- else {
- set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+ } else {
+ set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
bio_endio(dip->orig_bio, 0);
}
out:
@@ -7243,25 +7238,34 @@ out_err:
return 0;
}
-static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
- loff_t file_offset)
+static void btrfs_submit_direct(int rw, struct bio *dio_bio,
+ struct inode *inode, loff_t file_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_private *dip;
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct bio_vec *bvec = dio_bio->bi_io_vec;
+ struct bio *io_bio;
int skip_sum;
int write = rw & REQ_WRITE;
int ret = 0;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
+
+ if (!io_bio) {
+ ret = -ENOMEM;
+ goto free_ordered;
+ }
+
dip = kmalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
ret = -ENOMEM;
- goto free_ordered;
+ goto free_io_bio;
}
- dip->private = bio->bi_private;
+ dip->private = dio_bio->bi_private;
+ io_bio->bi_private = dio_bio->bi_private;
dip->inode = inode;
dip->logical_offset = file_offset;
@@ -7269,22 +7273,27 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
do {
dip->bytes += bvec->bv_len;
bvec++;
- } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+ } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
- dip->disk_bytenr = (u64)bio->bi_sector << 9;
- bio->bi_private = dip;
+ dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+ io_bio->bi_private = dip;
dip->errors = 0;
- dip->orig_bio = bio;
+ dip->orig_bio = io_bio;
+ dip->dio_bio = dio_bio;
atomic_set(&dip->pending_bios, 0);
if (write)
- bio->bi_end_io = btrfs_endio_direct_write;
+ io_bio->bi_end_io = btrfs_endio_direct_write;
else
- bio->bi_end_io = btrfs_endio_direct_read;
+ io_bio->bi_end_io = btrfs_endio_direct_read;
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
+
+free_io_bio:
+ bio_put(io_bio);
+
free_ordered:
/*
* If this is a write, we need to clean up the reserved space and kill
@@ -7300,7 +7309,7 @@ free_ordered:
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
- bio_endio(bio, ret);
+ bio_endio(dio_bio, ret);
}
static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
@@ -7484,7 +7493,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
}
-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *tree;
@@ -7979,7 +7989,6 @@ void btrfs_destroy_inode(struct inode *inode)
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
- btrfs_remove_delayed_node(inode);
call_rcu(&inode->i_rcu, btrfs_i_callback);
}
@@ -7987,6 +7996,9 @@ int btrfs_drop_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ if (root == NULL)
+ return 1;
+
/* the snap/subvol tree is on deleting */
if (btrfs_root_refs(&root->root_item) == 0 &&
root != root->fs_info->tree_root)
@@ -8703,7 +8715,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = btrfs_real_readdir,
+ .iterate = btrfs_real_readdir,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0de4a2fcfb24..0f81d67cdc8d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1801,7 +1801,11 @@ static noinline int copy_to_sk(struct btrfs_root *root,
item_off = btrfs_item_ptr_offset(leaf, i);
item_len = btrfs_item_size_nr(leaf, i);
- if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+ btrfs_item_key_to_cpu(leaf, key, i);
+ if (!key_in_sk(key, sk))
+ continue;
+
+ if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
item_len = 0;
if (sizeof(sh) + item_len + *sk_offset >
@@ -1810,10 +1814,6 @@ static noinline int copy_to_sk(struct btrfs_root *root,
goto overflow;
}
- btrfs_item_key_to_cpu(leaf, key, i);
- if (!key_in_sk(key, sk))
- continue;
-
sh.objectid = key->objectid;
sh.offset = key->offset;
sh.type = key->type;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0740621daf6c..0525e1389f5b 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1050,7 +1050,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
if (!bio)
return -ENOMEM;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 704a1b8d2a2b..4febca4fc2de 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1773,7 +1773,7 @@ again:
if (!eb || !extent_buffer_uptodate(eb)) {
ret = (!eb) ? -ENOMEM : -EIO;
free_extent_buffer(eb);
- return ret;
+ break;
}
btrfs_tree_lock(eb);
if (cow) {
@@ -3350,6 +3350,11 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
}
truncate:
+ ret = btrfs_check_trunc_cache_free_space(root,
+ &fs_info->global_block_rsv);
+ if (ret)
+ goto out;
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -4077,7 +4082,7 @@ out:
return inode;
}
-static struct reloc_control *alloc_reloc_control(void)
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
struct reloc_control *rc;
@@ -4088,7 +4093,8 @@ static struct reloc_control *alloc_reloc_control(void)
INIT_LIST_HEAD(&rc->reloc_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks, NULL);
+ extent_io_tree_init(&rc->processed_blocks,
+ fs_info->btree_inode->i_mapping);
return rc;
}
@@ -4105,7 +4111,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
int rw = 0;
int err = 0;
- rc = alloc_reloc_control();
+ rc = alloc_reloc_control(fs_info);
if (!rc)
return -ENOMEM;
@@ -4306,7 +4312,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (list_empty(&reloc_roots))
goto out;
- rc = alloc_reloc_control();
+ rc = alloc_reloc_control(root->fs_info);
if (!rc) {
err = -ENOMEM;
goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f489e24659a4..79bd479317cb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1296,7 +1296,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
WARN_ON(!page->page);
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio) {
page->io_error = 1;
sblock->no_io_error_seen = 0;
@@ -1431,7 +1431,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
}
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
bio->bi_bdev = page_bad->dev->bdev;
@@ -1522,7 +1522,7 @@ again:
sbio->dev = wr_ctx->tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
if (!bio) {
mutex_unlock(&wr_ctx->wr_lock);
return -ENOMEM;
@@ -1930,7 +1930,7 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
if (!bio)
return -ENOMEM;
sbio->bio = bio;
@@ -3307,7 +3307,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
return -EIO;
}
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a4807ced23cc..f0857e092a3c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1263,6 +1263,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
ret = btrfs_commit_super(root);
if (ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0e925ced971b..8bffb9174afb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3120,14 +3120,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
if (num_devices == 1)
allowed |= BTRFS_BLOCK_GROUP_DUP;
- else if (num_devices < 4)
+ else if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
- else
- allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6);
-
+ if (num_devices > 2)
+ allowed |= BTRFS_BLOCK_GROUP_RAID5;
+ if (num_devices > 3)
+ allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID6);
if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->data.target, 1) ||
(bctl->data.target & ~allowed))) {
@@ -5019,42 +5018,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
-static void *merge_stripe_index_into_bio_private(void *bi_private,
- unsigned int stripe_index)
-{
- /*
- * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
- * at most 1.
- * The alternative solution (instead of stealing bits from the
- * pointer) would be to allocate an intermediate structure
- * that contains the old private pointer plus the stripe_index.
- */
- BUG_ON((((uintptr_t)bi_private) & 3) != 0);
- BUG_ON(stripe_index > 3);
- return (void *)(((uintptr_t)bi_private) | stripe_index);
-}
-
-static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
-{
- return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
-}
-
-static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
-{
- return (unsigned int)((uintptr_t)bi_private) & 3;
-}
-
static void btrfs_end_bio(struct bio *bio, int err)
{
- struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
+ struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
if (err) {
atomic_inc(&bbio->error);
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
- extract_stripe_index_from_bio_private(
- bio->bi_private);
+ btrfs_io_bio(bio)->stripe_index;
struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
@@ -5084,8 +5057,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
}
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
- bio->bi_bdev = (struct block_device *)
- (unsigned long)bbio->mirror_num;
+ btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
@@ -5211,8 +5183,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
bio->bi_private = bbio;
- bio->bi_private = merge_stripe_index_into_bio_private(
- bio->bi_private, (unsigned int)dev_nr);
+ btrfs_io_bio(bio)->stripe_index = dev_nr;
bio->bi_end_io = btrfs_end_bio;
bio->bi_sector = physical >> 9;
#ifdef DEBUG
@@ -5273,8 +5244,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
if (atomic_dec_and_test(&bbio->stripes_pending)) {
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
- bio->bi_bdev = (struct block_device *)
- (unsigned long)bbio->mirror_num;
+ btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_sector = logical >> 9;
kfree(bbio);
bio_endio(bio, -EIO);
@@ -5352,7 +5322,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
}
if (dev_nr < total_devs - 1) {
- bio = bio_clone(first_bio, GFP_NOFS);
+ bio = btrfs_bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 845ccbb0d2e3..f6247e2a47f7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -152,6 +152,26 @@ struct btrfs_fs_devices {
int rotating;
};
+/*
+ * we need the mirror number and stripe index to be passed around
+ * the call chain while we are processing end_io (especially errors).
+ * Really, what we need is a btrfs_bio structure that has this info
+ * and is properly sized with its stripe array, but we're not there
+ * quite yet. We have our own btrfs bioset, and all of the bios
+ * we allocate are actually btrfs_io_bios. We'll cram as much of
+ * struct btrfs_bio as we can into this over time.
+ */
+struct btrfs_io_bio {
+ unsigned long mirror_num;
+ unsigned long stripe_index;
+ struct bio bio;
+};
+
+static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+{
+ return container_of(bio, struct btrfs_io_bio, bio);
+}
+
struct btrfs_bio_stripe {
struct btrfs_device *dev;
u64 physical;
diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..f93392e2df12 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1454,7 +1454,8 @@ static void discard_buffer(struct buffer_head * bh)
* block_invalidatepage - invalidate part or all of a buffer-backed page
*
* @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
* block_invalidatepage() is called when all or part of the page has become
* invalidated by a truncate operation.
@@ -1465,15 +1466,22 @@ static void discard_buffer(struct buffer_head * bh)
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
+ unsigned int stop = length + offset;
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
goto out;
+ /*
+ * Check for overflow
+ */
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
head = page_buffers(page);
bh = head;
do {
@@ -1481,6 +1489,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
next = bh->b_this_page;
/*
+ * Are we still fully in range ?
+ */
+ if (next_off > stop)
+ goto out;
+
+ /*
* is this block fully invalidated?
*/
if (offset <= curr_off)
@@ -1501,6 +1515,7 @@ out:
}
EXPORT_SYMBOL(block_invalidatepage);
+
/*
* We attach and possibly dirty the buffers atomically wrt
* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -2841,7 +2856,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
* they may have been added in ext3_writepage(). Make them
* freeable here, so the page does not leak.
*/
- do_invalidatepage(page, 0);
+ do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 746ce532e130..d4c1206af9fc 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -13,8 +13,6 @@
#include <linux/mount.h>
#include "internal.h"
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
struct cachefiles_lookup_data {
struct cachefiles_xattr *auxdata; /* auxiliary data */
char *key; /* key path */
@@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
object = container_of(_object, struct cachefiles_object, fscache);
cache = container_of(object->fscache.cache, struct cachefiles_cache,
cache);
+
+ if (!fscache_use_cookie(_object)) {
+ _leave(" [relinq]");
+ return;
+ }
+
cookie = object->fscache.cookie;
if (!cookie->def->get_aux) {
+ fscache_unuse_cookie(_object);
_leave(" [no aux]");
return;
}
auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
if (!auxdata) {
+ fscache_unuse_cookie(_object);
_leave(" [nomem]");
return;
}
auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+ fscache_unuse_cookie(_object);
ASSERTCMP(auxlen, <, 511);
auxdata->len = auxlen + 1;
@@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
#endif
/* delete retired objects */
- if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+ if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
_object != cache->cache.fsdef
) {
_debug("- retire object OBJ%x", object->fscache.debug_id);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 8c01c5fcdf75..25badd1aec5c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
printk(KERN_ERR "%sobject: OBJ%x\n",
prefix, object->fscache.debug_id);
printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
- prefix, fscache_object_states[object->fscache.state],
+ prefix, object->fscache.state->name,
object->fscache.flags, work_busy(&object->fscache.work),
object->fscache.events, object->fscache.event_mask);
printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
found_dentry:
kdebug("preemptive burial: OBJ%x [%s] %p",
object->fscache.debug_id,
- fscache_object_states[object->fscache.state],
+ object->fscache.state->name,
dentry);
- if (object->fscache.state < FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_live(&object->fscache)) {
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"
" Can't preemptively bury live object\n");
@@ -192,7 +192,7 @@ try_again:
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
wait_for_old_object:
- if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_live(&object->fscache)) {
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"
" Unexpected object collision\n");
@@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
// dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
/* look up the victim */
- mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
start = jiffies;
victim = lookup_one_len(filename, dir, strlen(filename));
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 73b46288b54b..2476e5162609 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac101040..38b5c1bc6776 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
* dirty page counters appropriately. Only called if there is private
* data on the page.
*/
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode;
struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
if (!PageDirty(page))
pr_err("%p invalidatepage %p page not dirty\n", inode, page);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
ci = ceph_inode(inode);
- if (offset == 0) {
- dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
- inode, page, page->index, offset);
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
+ dout("%p invalidatepage %p idx %lu full dirty page\n",
+ inode, page, page->index);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc);
page->private = 0;
ClearPagePrivate(page);
} else {
- dout("%p invalidatepage %p idx %lu partial dirty page\n",
- inode, page, page->index);
+ dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+ inode, page, page->index, offset, length);
}
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f02d82b7933e..a40ceda47a32 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -111,11 +111,10 @@ static unsigned fpos_off(loff_t p)
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
-static int __dcache_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx)
{
- struct ceph_file_info *fi = filp->private_data;
- struct dentry *parent = filp->f_dentry;
+ struct ceph_file_info *fi = file->private_data;
+ struct dentry *parent = file->f_dentry;
struct inode *dir = parent->d_inode;
struct list_head *p;
struct dentry *dentry, *last;
@@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp,
last = fi->dentry;
fi->dentry = NULL;
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+ dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
last);
spin_lock(&parent->d_lock);
/* start at beginning? */
- if (filp->f_pos == 2 || last == NULL ||
- filp->f_pos < ceph_dentry(last)->offset) {
+ if (ctx->pos == 2 || last == NULL ||
+ ctx->pos < ceph_dentry(last)->offset) {
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
@@ -157,11 +156,11 @@ more:
if (!d_unhashed(dentry) && dentry->d_inode &&
ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
- filp->f_pos <= di->offset)
+ ctx->pos <= di->offset)
break;
dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
dentry->d_name.len, dentry->d_name.name, di->offset,
- filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+ ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
!dentry->d_inode ? " null" : "");
spin_unlock(&dentry->d_lock);
p = p->prev;
@@ -173,29 +172,27 @@ more:
spin_unlock(&dentry->d_lock);
spin_unlock(&parent->d_lock);
- dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+ dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- filp->f_pos = di->offset;
- err = filldir(dirent, dentry->d_name.name,
- dentry->d_name.len, di->offset,
+ ctx->pos = di->offset;
+ if (!dir_emit(ctx, dentry->d_name.name,
+ dentry->d_name.len,
ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
- dentry->d_inode->i_mode >> 12);
-
- if (last) {
- if (err < 0) {
+ dentry->d_inode->i_mode >> 12)) {
+ if (last) {
/* remember our position */
fi->dentry = last;
fi->next_offset = di->offset;
- } else {
- dput(last);
}
+ dput(dentry);
+ return 0;
}
- last = dentry;
- if (err < 0)
- goto out;
+ if (last)
+ dput(last);
+ last = dentry;
- filp->f_pos++;
+ ctx->pos++;
/* make sure a dentry wasn't dropped while we didn't have parent lock */
if (!ceph_dir_is_complete(dir)) {
@@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
return 0;
}
-static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
- struct ceph_file_info *fi = filp->private_data;
- struct inode *inode = file_inode(filp);
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned frag = fpos_frag(filp->f_pos);
- int off = fpos_off(filp->f_pos);
+ unsigned frag = fpos_frag(ctx->pos);
+ int off = fpos_off(ctx->pos);
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
const int max_entries = fsc->mount_options->max_readdir;
const int max_bytes = fsc->mount_options->max_readdir_bytes;
- dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+ dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
if (fi->flags & CEPH_F_ATEND)
return 0;
/* always start with . and .. */
- if (filp->f_pos == 0) {
+ if (ctx->pos == 0) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
fi->dir_release_count = atomic_read(&ci->i_release_count);
dout("readdir off 0 -> '.'\n");
- if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+ if (!dir_emit(ctx, ".", 1,
ceph_translate_ino(inode->i_sb, inode->i_ino),
- inode->i_mode >> 12) < 0)
+ inode->i_mode >> 12))
return 0;
- filp->f_pos = 1;
+ ctx->pos = 1;
off = 1;
}
- if (filp->f_pos == 1) {
- ino_t ino = parent_ino(filp->f_dentry);
+ if (ctx->pos == 1) {
+ ino_t ino = parent_ino(file->f_dentry);
dout("readdir off 1 -> '..'\n");
- if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+ if (!dir_emit(ctx, "..", 2,
ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12) < 0)
+ inode->i_mode >> 12))
return 0;
- filp->f_pos = 2;
+ ctx->pos = 2;
off = 2;
}
/* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
- if ((filp->f_pos == 2 || fi->dentry) &&
+ if ((ctx->pos == 2 || fi->dentry) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete(ci) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
spin_unlock(&ci->i_ceph_lock);
- err = __dcache_readdir(filp, dirent, filldir);
+ err = __dcache_readdir(file, ctx);
if (err != -EAGAIN)
return err;
} else {
@@ -327,7 +324,7 @@ more:
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
- req->r_dentry = dget(filp->f_dentry);
+ req->r_dentry = dget(file->f_dentry);
/* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS;
req->r_direct_hash = ceph_frag_value(frag);
@@ -379,15 +376,16 @@ more:
rinfo = &fi->last_readdir->r_reply_info;
dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
rinfo->dir_nr, off, fi->offset);
+
+ ctx->pos = ceph_make_fpos(frag, off);
while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
- u64 pos = ceph_make_fpos(frag, off);
struct ceph_mds_reply_inode *in =
rinfo->dir_in[off - fi->offset].in;
struct ceph_vino vino;
ino_t ino;
dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
- off, off - fi->offset, rinfo->dir_nr, pos,
+ off, off - fi->offset, rinfo->dir_nr, ctx->pos,
rinfo->dir_dname_len[off - fi->offset],
rinfo->dir_dname[off - fi->offset], in);
BUG_ON(!in);
@@ -395,16 +393,15 @@ more:
vino.ino = le64_to_cpu(in->ino);
vino.snap = le64_to_cpu(in->snapid);
ino = ceph_vino_to_ino(vino);
- if (filldir(dirent,
+ if (!dir_emit(ctx,
rinfo->dir_dname[off - fi->offset],
rinfo->dir_dname_len[off - fi->offset],
- pos,
- ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
+ ceph_translate_ino(inode->i_sb, ino), ftype)) {
dout("filldir stopping us...\n");
return 0;
}
off++;
- filp->f_pos = pos + 1;
+ ctx->pos++;
}
if (fi->last_name) {
@@ -417,7 +414,7 @@ more:
if (!ceph_frag_is_rightmost(frag)) {
frag = ceph_frag_next(frag);
off = 0;
- filp->f_pos = ceph_make_fpos(frag, off);
+ ctx->pos = ceph_make_fpos(frag, off);
dout("readdir next frag is %x\n", frag);
goto more;
}
@@ -432,11 +429,11 @@ more:
if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
dout(" marking %p complete\n", inode);
__ceph_dir_set_complete(ci, fi->dir_release_count);
- ci->i_max_offset = filp->f_pos;
+ ci->i_max_offset = ctx->pos;
}
spin_unlock(&ci->i_ceph_lock);
- dout("readdir %p filp %p done.\n", inode, filp);
+ dout("readdir %p file %p done.\n", inode, file);
return 0;
}
@@ -1268,7 +1265,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
const struct file_operations ceph_dir_fops = {
.read = ceph_read_dir,
- .readdir = ceph_readdir,
+ .iterate = ceph_readdir,
.llseek = ceph_dir_llseek,
.open = ceph_open,
.release = ceph_release,
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 202dd3d68be0..ebbf680378e2 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -191,27 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
}
/**
- * Encode the flock and fcntl locks for the given inode into the pagelist.
- * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
- * sequential flock locks.
- * Must be called with lock_flocks() already held.
- * If we encounter more of a specific lock type than expected,
- * we return the value 1.
+ * Encode the flock and fcntl locks for the given inode into the ceph_filelock
+ * array. Must be called with lock_flocks() already held.
+ * If we encounter more of a specific lock type than expected, return -ENOSPC.
*/
-int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
- int num_fcntl_locks, int num_flock_locks)
+int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
- struct ceph_filelock cephlock;
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
+ int l = 0;
dout("encoding %d flock and %d fcntl locks", num_flock_locks,
num_fcntl_locks);
- err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
- if (err)
- goto fail;
+
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_POSIX) {
++seen_fcntl;
@@ -219,19 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &cephlock);
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
+ ++l;
}
- if (err)
- goto fail;
}
-
- err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
- if (err)
- goto fail;
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_FLOCK) {
++seen_flock;
@@ -239,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &cephlock);
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
+ ++l;
}
- if (err)
- goto fail;
}
fail:
return err;
}
+/**
+ * Copy the encoded flock and fcntl locks into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Returns zero on success.
+ */
+int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks)
+{
+ int err = 0;
+ __le32 nlocks;
+
+ nlocks = cpu_to_le32(num_fcntl_locks);
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ err = ceph_pagelist_append(pagelist, flocks,
+ num_fcntl_locks * sizeof(*flocks));
+ if (err)
+ goto out_fail;
+
+ nlocks = cpu_to_le32(num_flock_locks);
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ err = ceph_pagelist_append(pagelist,
+ &flocks[num_fcntl_locks],
+ num_flock_locks * sizeof(*flocks));
+out_fail:
+ return err;
+}
+
/*
* Given a pointer to a lock, convert it to a ceph filelock
*/
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4f22671a5bd4..4d2920304be8 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2478,39 +2478,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->flock) {
int num_fcntl_locks, num_flock_locks;
- struct ceph_pagelist_cursor trunc_point;
-
- ceph_pagelist_set_cursor(pagelist, &trunc_point);
- do {
- lock_flocks();
- ceph_count_locks(inode, &num_fcntl_locks,
- &num_flock_locks);
- rec.v2.flock_len = (2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
- unlock_flocks();
-
- /* pre-alloc pagelist */
- ceph_pagelist_truncate(pagelist, &trunc_point);
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_pagelist_reserve(pagelist,
- rec.v2.flock_len);
-
- /* encode locks */
- if (!err) {
- lock_flocks();
- err = ceph_encode_locks(inode,
- pagelist,
- num_fcntl_locks,
- num_flock_locks);
- unlock_flocks();
- }
- } while (err == -ENOSPC);
+ struct ceph_filelock *flocks;
+
+encode_again:
+ lock_flocks();
+ ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+ unlock_flocks();
+ flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock), GFP_NOFS);
+ if (!flocks) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ lock_flocks();
+ err = ceph_encode_locks_to_buffer(inode, flocks,
+ num_fcntl_locks,
+ num_flock_locks);
+ unlock_flocks();
+ if (err) {
+ kfree(flocks);
+ if (err == -ENOSPC)
+ goto encode_again;
+ goto out_free;
+ }
+ /*
+ * number of encoded locks is stable, so copy to pagelist
+ */
+ rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
+ (num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock));
+ err = ceph_pagelist_append(pagelist, &rec, reclen);
+ if (!err)
+ err = ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ kfree(flocks);
} else {
err = ceph_pagelist_append(pagelist, &rec, reclen);
}
-
out_free:
kfree(path);
out_dput:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 8696be2ff679..7ccfdb4aea2e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -822,8 +822,13 @@ extern const struct export_operations ceph_export_ops;
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
-extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
- int p_locks, int f_locks);
+extern int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks,
+ int num_flock_locks);
+extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks);
extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
/* debugfs.c */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 8e33ec65847b..58df174deb10 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/vfs.h>
#include <linux/fs.h>
+#include <linux/inet.h>
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifsfs.h"
@@ -48,58 +49,74 @@ void cifs_dfs_release_automount_timer(void)
}
/**
- * cifs_get_share_name - extracts share name from UNC
- * @node_name: pointer to UNC string
+ * cifs_build_devname - build a devicename from a UNC and optional prepath
+ * @nodename: pointer to UNC string
+ * @prepath: pointer to prefixpath (or NULL if there isn't one)
*
- * Extracts sharename form full UNC.
- * i.e. strips from UNC trailing path that is not part of share
- * name and fixup missing '\' in the beginning of DFS node refferal
- * if necessary.
- * Returns pointer to share name on success or ERR_PTR on error.
- * Caller is responsible for freeing returned string.
+ * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer
+ * big enough to hold the final thing. Copy the UNC from the nodename, and
+ * concatenate the prepath onto the end of it if there is one.
+ *
+ * Returns pointer to the built string, or a ERR_PTR. Caller is responsible
+ * for freeing the returned string.
*/
-static char *cifs_get_share_name(const char *node_name)
+static char *
+cifs_build_devname(char *nodename, const char *prepath)
{
- int len;
- char *UNC;
- char *pSep;
-
- len = strlen(node_name);
- UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
- GFP_KERNEL);
- if (!UNC)
- return ERR_PTR(-ENOMEM);
+ size_t pplen;
+ size_t unclen;
+ char *dev;
+ char *pos;
+
+ /* skip over any preceding delimiters */
+ nodename += strspn(nodename, "\\");
+ if (!*nodename)
+ return ERR_PTR(-EINVAL);
- /* get share name and server name */
- if (node_name[1] != '\\') {
- UNC[0] = '\\';
- strncpy(UNC+1, node_name, len);
- len++;
- UNC[len] = 0;
- } else {
- strncpy(UNC, node_name, len);
- UNC[len] = 0;
- }
+ /* get length of UNC and set pos to last char */
+ unclen = strlen(nodename);
+ pos = nodename + unclen - 1;
- /* find server name end */
- pSep = memchr(UNC+2, '\\', len-2);
- if (!pSep) {
- cifs_dbg(VFS, "%s: no server name end in node name: %s\n",
- __func__, node_name);
- kfree(UNC);
- return ERR_PTR(-EINVAL);
+ /* trim off any trailing delimiters */
+ while (*pos == '\\') {
+ --pos;
+ --unclen;
}
- /* find sharename end */
- pSep++;
- pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC));
- if (pSep) {
- /* trim path up to sharename end
- * now we have share name in UNC */
- *pSep = 0;
+ /* allocate a buffer:
+ * +2 for preceding "//"
+ * +1 for delimiter between UNC and prepath
+ * +1 for trailing NULL
+ */
+ pplen = prepath ? strlen(prepath) : 0;
+ dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ pos = dev;
+ /* add the initial "//" */
+ *pos = '/';
+ ++pos;
+ *pos = '/';
+ ++pos;
+
+ /* copy in the UNC portion from referral */
+ memcpy(pos, nodename, unclen);
+ pos += unclen;
+
+ /* copy the prefixpath remainder (if there is one) */
+ if (pplen) {
+ *pos = '/';
+ ++pos;
+ memcpy(pos, prepath, pplen);
+ pos += pplen;
}
- return UNC;
+ /* NULL terminator */
+ *pos = '\0';
+
+ convert_delimiter(dev, '/');
+ return dev;
}
@@ -123,6 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
{
int rc;
char *mountdata = NULL;
+ const char *prepath = NULL;
int md_len;
char *tkn_e;
char *srvIP = NULL;
@@ -132,7 +150,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
if (sb_mountdata == NULL)
return ERR_PTR(-EINVAL);
- *devname = cifs_get_share_name(ref->node_name);
+ if (strlen(fullpath) - ref->path_consumed)
+ prepath = fullpath + ref->path_consumed;
+
+ *devname = cifs_build_devname(ref->node_name, prepath);
if (IS_ERR(*devname)) {
rc = PTR_ERR(*devname);
*devname = NULL;
@@ -146,12 +167,14 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
goto compose_mount_options_err;
}
- /* md_len = strlen(...) + 12 for 'sep+prefixpath='
- * assuming that we have 'unc=' and 'ip=' in
- * the original sb_mountdata
+ /*
+ * In most cases, we'll be building a shorter string than the original,
+ * but we do have to assume that the address in the ip= option may be
+ * much longer than the original. Add the max length of an address
+ * string to the length of the original string to allow for worst case.
*/
- md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
- mountdata = kzalloc(md_len+1, GFP_KERNEL);
+ md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
+ mountdata = kzalloc(md_len + 1, GFP_KERNEL);
if (mountdata == NULL) {
rc = -ENOMEM;
goto compose_mount_options_err;
@@ -195,26 +218,6 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
strncat(mountdata, &sep, 1);
strcat(mountdata, "ip=");
strcat(mountdata, srvIP);
- strncat(mountdata, &sep, 1);
- strcat(mountdata, "unc=");
- strcat(mountdata, *devname);
-
- /* find & copy prefixpath */
- tkn_e = strchr(ref->node_name + 2, '\\');
- if (tkn_e == NULL) {
- /* invalid unc, missing share name*/
- rc = -EINVAL;
- goto compose_mount_options_err;
- }
-
- tkn_e = strchr(tkn_e + 1, '\\');
- if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
- strncat(mountdata, &sep, 1);
- strcat(mountdata, "prefixpath=");
- if (tkn_e)
- strcat(mountdata, tkn_e + 1);
- strcat(mountdata, fullpath + ref->path_consumed);
- }
/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 72e4efee1389..540c1ccfcdb2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -372,9 +372,6 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
cifs_show_security(s, tcon->ses->server);
cifs_show_cache_flavor(s, cifs_sb);
- seq_printf(s, ",unc=");
- seq_escape(s, tcon->treeName, " \t\n\\");
-
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
seq_printf(s, ",multiuser");
else if (tcon->ses->user_name)
@@ -971,7 +968,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
};
const struct file_operations cifs_dir_ops = {
- .readdir = cifs_readdir,
+ .iterate = cifs_readdir,
.release = cifs_closedir,
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0e32c3446ce9..d05b3028e3b9 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -101,7 +101,7 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
-extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
+extern int cifs_readdir(struct file *file, struct dir_context *ctx);
/* Functions related to dir entries */
extern const struct dentry_operations cifs_dentry_ops;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 99eeaa17ee00..e3bc39bb9d12 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1061,6 +1061,7 @@ static int cifs_parse_security_flavors(char *value,
#endif
case Opt_sec_none:
vol->nullauth = 1;
+ vol->secFlg |= CIFSSEC_MAY_NTLM;
break;
default:
cifs_dbg(VFS, "bad security option: %s\n", value);
@@ -1257,14 +1258,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->backupuid_specified = false; /* no backup intent for a user */
vol->backupgid_specified = false; /* no backup intent for a group */
- /*
- * For now, we ignore -EINVAL errors under the assumption that the
- * unc= and prefixpath= options will be usable.
- */
- if (cifs_parse_devname(devname, vol) == -ENOMEM) {
- printk(KERN_ERR "CIFS: Unable to allocate memory to parse "
- "device string.\n");
- goto out_nomem;
+ switch (cifs_parse_devname(devname, vol)) {
+ case 0:
+ break;
+ case -ENOMEM:
+ cifs_dbg(VFS, "Unable to allocate memory for devname.\n");
+ goto cifs_parse_mount_err;
+ case -EINVAL:
+ cifs_dbg(VFS, "Malformed UNC in devname.\n");
+ goto cifs_parse_mount_err;
+ default:
+ cifs_dbg(VFS, "Unknown error parsing devname.\n");
+ goto cifs_parse_mount_err;
}
while ((data = strsep(&options, separator)) != NULL) {
@@ -1826,7 +1831,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
#endif
if (!vol->UNC) {
- cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string or in unc= option!\n");
+ cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n");
goto cifs_parse_mount_err;
}
@@ -3274,8 +3279,8 @@ build_unc_path_to_root(const struct smb_vol *vol,
pos = full_path + unc_len;
if (pplen) {
- *pos++ = CIFS_DIR_SEP(cifs_sb);
- strncpy(pos, vol->prepath, pplen);
+ *pos = CIFS_DIR_SEP(cifs_sb);
+ strncpy(pos + 1, vol->prepath, pplen);
pos += pplen;
}
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index e7512e497611..7ede7306599f 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -34,7 +34,7 @@
/**
* dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
- * @unc: UNC path specifying the server
+ * @unc: UNC path specifying the server (with '/' as delimiter)
* @ip_addr: Where to return the IP address.
*
* The IP address will be returned in string form, and the caller is
@@ -64,7 +64,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
hostname = unc + 2;
/* Search for server name delimiter */
- sep = memchr(hostname, '\\', len);
+ sep = memchr(hostname, '/', len);
if (sep)
len = sep - hostname;
else
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 48b29d24c9f4..4d8ba8d491e5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3546,11 +3546,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
return cifs_fscache_release_page(page, gfp);
}
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index fc3025199cb3..20efd81266c6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -171,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
inode->i_flags |= S_AUTOMOUNT;
- cifs_set_ops(inode);
+ if (inode->i_state & I_NEW)
+ cifs_set_ops(inode);
}
void
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 770d5a9781c1..f1213799de1a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -537,14 +537,14 @@ static int cifs_save_resume_key(const char *current_entry,
* every entry (do not increment for . or .. entry).
*/
static int
-find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
+find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
struct file *file, char **current_entry, int *num_to_ret)
{
__u16 search_flags;
int rc = 0;
int pos_in_buf = 0;
loff_t first_entry_in_buffer;
- loff_t index_to_find = file->f_pos;
+ loff_t index_to_find = pos;
struct cifsFileInfo *cfile = file->private_data;
struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -659,8 +659,9 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
- void *dirent, char *scratch_buf, unsigned int max_len)
+static int cifs_filldir(char *find_entry, struct file *file,
+ struct dir_context *ctx,
+ char *scratch_buf, unsigned int max_len)
{
struct cifsFileInfo *file_info = file->private_data;
struct super_block *sb = file->f_path.dentry->d_sb;
@@ -740,13 +741,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
cifs_prime_dcache(file->f_dentry, &name, &fattr);
ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
- rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
- fattr.cf_dtype);
- return rc;
+ return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
}
-int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
+int cifs_readdir(struct file *file, struct dir_context *ctx)
{
int rc = 0;
unsigned int xid;
@@ -772,103 +771,86 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
goto rddir2_exit;
}
- switch ((int) file->f_pos) {
- case 0:
- if (filldir(direntry, ".", 1, file->f_pos,
- file_inode(file)->i_ino, DT_DIR) < 0) {
- cifs_dbg(VFS, "Filldir for current dir failed\n");
- rc = -ENOMEM;
- break;
- }
- file->f_pos++;
- case 1:
- if (filldir(direntry, "..", 2, file->f_pos,
- parent_ino(file->f_path.dentry), DT_DIR) < 0) {
- cifs_dbg(VFS, "Filldir for parent dir failed\n");
- rc = -ENOMEM;
- break;
- }
- file->f_pos++;
- default:
- /* 1) If search is active,
- is in current search buffer?
- if it before then restart search
- if after then keep searching till find it */
-
- if (file->private_data == NULL) {
- rc = -EINVAL;
- free_xid(xid);
- return rc;
- }
- cifsFile = file->private_data;
- if (cifsFile->srch_inf.endOfSearch) {
- if (cifsFile->srch_inf.emptyDir) {
- cifs_dbg(FYI, "End of search, empty dir\n");
- rc = 0;
- break;
- }
- } /* else {
- cifsFile->invalidHandle = true;
- tcon->ses->server->close(xid, tcon, &cifsFile->fid);
- } */
+ if (!dir_emit_dots(file, ctx))
+ goto rddir2_exit;
- tcon = tlink_tcon(cifsFile->tlink);
- rc = find_cifs_entry(xid, tcon, file, &current_entry,
- &num_to_fill);
- if (rc) {
- cifs_dbg(FYI, "fce error %d\n", rc);
- goto rddir2_exit;
- } else if (current_entry != NULL) {
- cifs_dbg(FYI, "entry %lld found\n", file->f_pos);
- } else {
- cifs_dbg(FYI, "could not find entry\n");
+ /* 1) If search is active,
+ is in current search buffer?
+ if it before then restart search
+ if after then keep searching till find it */
+
+ if (file->private_data == NULL) {
+ rc = -EINVAL;
+ goto rddir2_exit;
+ }
+ cifsFile = file->private_data;
+ if (cifsFile->srch_inf.endOfSearch) {
+ if (cifsFile->srch_inf.emptyDir) {
+ cifs_dbg(FYI, "End of search, empty dir\n");
+ rc = 0;
goto rddir2_exit;
}
- cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
- num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
- max_len = tcon->ses->server->ops->calc_smb_size(
- cifsFile->srch_inf.ntwrk_buf_start);
- end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
-
- tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
- if (tmp_buf == NULL) {
- rc = -ENOMEM;
+ } /* else {
+ cifsFile->invalidHandle = true;
+ tcon->ses->server->close(xid, tcon, &cifsFile->fid);
+ } */
+
+ tcon = tlink_tcon(cifsFile->tlink);
+ rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
+ &num_to_fill);
+ if (rc) {
+ cifs_dbg(FYI, "fce error %d\n", rc);
+ goto rddir2_exit;
+ } else if (current_entry != NULL) {
+ cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
+ } else {
+ cifs_dbg(FYI, "could not find entry\n");
+ goto rddir2_exit;
+ }
+ cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
+ num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
+ max_len = tcon->ses->server->ops->calc_smb_size(
+ cifsFile->srch_inf.ntwrk_buf_start);
+ end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
+ tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+ if (tmp_buf == NULL) {
+ rc = -ENOMEM;
+ goto rddir2_exit;
+ }
+
+ for (i = 0; i < num_to_fill; i++) {
+ if (current_entry == NULL) {
+ /* evaluate whether this case is an error */
+ cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
+ num_to_fill, i);
break;
}
-
- for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
- if (current_entry == NULL) {
- /* evaluate whether this case is an error */
- cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
- num_to_fill, i);
- break;
- }
- /*
- * if buggy server returns . and .. late do we want to
- * check for that here?
- */
- rc = cifs_filldir(current_entry, file, filldir,
- direntry, tmp_buf, max_len);
- if (rc == -EOVERFLOW) {
+ /*
+ * if buggy server returns . and .. late do we want to
+ * check for that here?
+ */
+ rc = cifs_filldir(current_entry, file, ctx,
+ tmp_buf, max_len);
+ if (rc) {
+ if (rc > 0)
rc = 0;
- break;
- }
-
- file->f_pos++;
- if (file->f_pos ==
- cifsFile->srch_inf.index_of_last_entry) {
- cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
- file->f_pos, tmp_buf);
- cifs_save_resume_key(current_entry, cifsFile);
- break;
- } else
- current_entry =
- nxt_dir_entry(current_entry, end_of_smb,
- cifsFile->srch_inf.info_level);
+ break;
}
- kfree(tmp_buf);
- break;
- } /* end switch */
+
+ ctx->pos++;
+ if (ctx->pos ==
+ cifsFile->srch_inf.index_of_last_entry) {
+ cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
+ ctx->pos, tmp_buf);
+ cifs_save_resume_key(current_entry, cifsFile);
+ break;
+ } else
+ current_entry =
+ nxt_dir_entry(current_entry, end_of_smb,
+ cifsFile->srch_inf.info_level);
+ }
+ kfree(tmp_buf);
rddir2_exit:
free_xid(xid);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index b7d3a05c062c..87e0ee9f4465 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -43,15 +43,14 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
struct inode *new_inode, struct dentry *new_dentry);
/* dir file-ops */
-static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
+static int coda_readdir(struct file *file, struct dir_context *ctx);
/* dentry ops */
static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
static int coda_dentry_delete(const struct dentry *);
/* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
- filldir_t filldir);
+static int coda_venus_readdir(struct file *, struct dir_context *);
/* same as fs/bad_inode.c */
static int coda_return_EIO(void)
@@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations =
const struct file_operations coda_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = coda_readdir,
+ .iterate = coda_readdir,
.open = coda_open,
.release = coda_release,
.fsync = coda_fsync,
@@ -378,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
/* file operations for directories */
-static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
+static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
{
struct coda_file_info *cfi;
struct file *host_file;
@@ -391,30 +390,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
if (!host_file->f_op)
return -ENOTDIR;
- if (host_file->f_op->readdir)
- {
- /* potemkin case: we were handed a directory inode.
- * We can't use vfs_readdir because we have to keep the file
- * position in sync between the coda_file and the host_file.
- * and as such we need grab the inode mutex. */
+ if (host_file->f_op->iterate) {
struct inode *host_inode = file_inode(host_file);
-
mutex_lock(&host_inode->i_mutex);
- host_file->f_pos = coda_file->f_pos;
-
ret = -ENOENT;
if (!IS_DEADDIR(host_inode)) {
- ret = host_file->f_op->readdir(host_file, buf, filldir);
+ ret = host_file->f_op->iterate(host_file, ctx);
file_accessed(host_file);
}
-
- coda_file->f_pos = host_file->f_pos;
mutex_unlock(&host_inode->i_mutex);
+ return ret;
}
- else /* Venus: we must read Venus dirents from a file */
- ret = coda_venus_readdir(coda_file, buf, filldir);
-
- return ret;
+ /* Venus: we must read Venus dirents from a file */
+ return coda_venus_readdir(coda_file, ctx);
}
static inline unsigned int CDT2DT(unsigned char cdt)
@@ -437,10 +425,8 @@ static inline unsigned int CDT2DT(unsigned char cdt)
}
/* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
- filldir_t filldir)
+static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
{
- int result = 0; /* # of entries returned */
struct coda_file_info *cfi;
struct coda_inode_info *cii;
struct file *host_file;
@@ -462,23 +448,12 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
if (!vdir) return -ENOMEM;
- if (coda_file->f_pos == 0) {
- ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR);
- if (ret < 0)
- goto out;
- result++;
- coda_file->f_pos++;
- }
- if (coda_file->f_pos == 1) {
- ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR);
- if (ret < 0)
- goto out;
- result++;
- coda_file->f_pos++;
- }
+ if (!dir_emit_dots(coda_file, ctx))
+ goto out;
+
while (1) {
/* read entries from the directory file */
- ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir,
+ ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
sizeof(*vdir));
if (ret < 0) {
printk(KERN_ERR "coda readdir: read dir %s failed %d\n",
@@ -507,7 +482,7 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
/* Make sure we skip '.' and '..', we already got those */
if (name.name[0] == '.' && (name.len == 1 ||
- (vdir->d_name[1] == '.' && name.len == 2)))
+ (name.name[1] == '.' && name.len == 2)))
vdir->d_fileno = name.len = 0;
/* skip null entries */
@@ -520,19 +495,16 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
if (!ino) ino = vdir->d_fileno;
type = CDT2DT(vdir->d_type);
- ret = filldir(buf, name.name, name.len,
- coda_file->f_pos, ino, type);
- /* failure means no space for filling in this round */
- if (ret < 0) break;
- result++;
+ if (!dir_emit(ctx, name.name, name.len, ino, type))
+ break;
}
/* we'll always have progress because d_reclen is unsigned and
* we've already established it is non-zero. */
- coda_file->f_pos += vdir->d_reclen;
+ ctx->pos += vdir->d_reclen;
}
out:
kfree(vdir);
- return result ? result : ret;
+ return 0;
}
/* called when a cache lookup succeeds */
diff --git a/fs/compat.c b/fs/compat.c
index fc3b55dce184..6af20de2c1a3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -832,6 +832,7 @@ struct compat_old_linux_dirent {
};
struct compat_readdir_callback {
+ struct dir_context ctx;
struct compat_old_linux_dirent __user *dirent;
int result;
};
@@ -873,15 +874,15 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
{
int error;
struct fd f = fdget(fd);
- struct compat_readdir_callback buf;
+ struct compat_readdir_callback buf = {
+ .ctx.actor = compat_fillonedir,
+ .dirent = dirent
+ };
if (!f.file)
return -EBADF;
- buf.result = 0;
- buf.dirent = dirent;
-
- error = vfs_readdir(f.file, compat_fillonedir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (buf.result)
error = buf.result;
@@ -897,6 +898,7 @@ struct compat_linux_dirent {
};
struct compat_getdents_callback {
+ struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
struct compat_linux_dirent __user *previous;
int count;
@@ -951,7 +953,11 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
{
struct fd f;
struct compat_linux_dirent __user * lastdirent;
- struct compat_getdents_callback buf;
+ struct compat_getdents_callback buf = {
+ .ctx.actor = compat_filldir,
+ .current_dir = dirent,
+ .count = count
+ };
int error;
if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -961,17 +967,12 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
if (!f.file)
return -EBADF;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
-
- error = vfs_readdir(f.file, compat_filldir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- if (put_user(f.file->f_pos, &lastdirent->d_off))
+ if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
@@ -983,6 +984,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
struct compat_getdents_callback64 {
+ struct dir_context ctx;
struct linux_dirent64 __user *current_dir;
struct linux_dirent64 __user *previous;
int count;
@@ -1036,7 +1038,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
{
struct fd f;
struct linux_dirent64 __user * lastdirent;
- struct compat_getdents_callback64 buf;
+ struct compat_getdents_callback64 buf = {
+ .ctx.actor = compat_filldir64,
+ .current_dir = dirent,
+ .count = count
+ };
int error;
if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -1046,17 +1052,12 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
if (!f.file)
return -EBADF;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
-
- error = vfs_readdir(f.file, compat_filldir64, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- typeof(lastdirent->d_off) d_off = f.file->f_pos;
+ typeof(lastdirent->d_off) d_off = buf.ctx.pos;
if (__put_user_unaligned(d_off, &lastdirent->d_off))
error = -EFAULT;
else
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 996cdc5abb85..5d19acfa7c6c 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -66,7 +66,6 @@
#include <linux/gigaset_dev.h>
#ifdef CONFIG_BLOCK
-#include <linux/loop.h>
#include <linux/cdrom.h>
#include <linux/fd.h>
#include <scsi/scsi.h>
@@ -954,8 +953,6 @@ COMPATIBLE_IOCTL(MTIOCTOP)
/* Socket level stuff */
COMPATIBLE_IOCTL(FIOQSIZE)
#ifdef CONFIG_BLOCK
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
/* md calls this on random blockdevs */
IGNORE_IOCTL(RAID_VERSION)
/* qemu/qemu-img might call these two on plain files for probing */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7aabc6ad4e9b..64e5323cbbb0 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1532,84 +1532,66 @@ static inline unsigned char dt_type(struct configfs_dirent *sd)
return (sd->s_mode >> 12) & 15;
}
-static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int configfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct super_block *sb = dentry->d_sb;
struct configfs_dirent * parent_sd = dentry->d_fsdata;
- struct configfs_dirent *cursor = filp->private_data;
+ struct configfs_dirent *cursor = file->private_data;
struct list_head *p, *q = &cursor->s_sibling;
ino_t ino = 0;
- int i = filp->f_pos;
- switch (i) {
- case 0:
- ino = dentry->d_inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- default:
- if (filp->f_pos == 2) {
- spin_lock(&configfs_dirent_lock);
- list_move(q, &parent_sd->s_children);
- spin_unlock(&configfs_dirent_lock);
- }
- for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
- struct configfs_dirent *next;
- const char * name;
- int len;
- struct inode *inode = NULL;
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+ if (ctx->pos == 2) {
+ spin_lock(&configfs_dirent_lock);
+ list_move(q, &parent_sd->s_children);
+ spin_unlock(&configfs_dirent_lock);
+ }
+ for (p = q->next; p != &parent_sd->s_children; p = p->next) {
+ struct configfs_dirent *next;
+ const char *name;
+ int len;
+ struct inode *inode = NULL;
+
+ next = list_entry(p, struct configfs_dirent, s_sibling);
+ if (!next->s_element)
+ continue;
- next = list_entry(p, struct configfs_dirent,
- s_sibling);
- if (!next->s_element)
- continue;
-
- name = configfs_get_name(next);
- len = strlen(name);
-
- /*
- * We'll have a dentry and an inode for
- * PINNED items and for open attribute
- * files. We lock here to prevent a race
- * with configfs_d_iput() clearing
- * s_dentry before calling iput().
- *
- * Why do we go to the trouble? If
- * someone has an attribute file open,
- * the inode number should match until
- * they close it. Beyond that, we don't
- * care.
- */
- spin_lock(&configfs_dirent_lock);
- dentry = next->s_dentry;
- if (dentry)
- inode = dentry->d_inode;
- if (inode)
- ino = inode->i_ino;
- spin_unlock(&configfs_dirent_lock);
- if (!inode)
- ino = iunique(sb, 2);
+ name = configfs_get_name(next);
+ len = strlen(name);
+
+ /*
+ * We'll have a dentry and an inode for
+ * PINNED items and for open attribute
+ * files. We lock here to prevent a race
+ * with configfs_d_iput() clearing
+ * s_dentry before calling iput().
+ *
+ * Why do we go to the trouble? If
+ * someone has an attribute file open,
+ * the inode number should match until
+ * they close it. Beyond that, we don't
+ * care.
+ */
+ spin_lock(&configfs_dirent_lock);
+ dentry = next->s_dentry;
+ if (dentry)
+ inode = dentry->d_inode;
+ if (inode)
+ ino = inode->i_ino;
+ spin_unlock(&configfs_dirent_lock);
+ if (!inode)
+ ino = iunique(sb, 2);
- if (filldir(dirent, name, len, filp->f_pos, ino,
- dt_type(next)) < 0)
- return 0;
+ if (!dir_emit(ctx, name, len, ino, dt_type(next)))
+ return 0;
- spin_lock(&configfs_dirent_lock);
- list_move(q, p);
- spin_unlock(&configfs_dirent_lock);
- p = q;
- filp->f_pos++;
- }
+ spin_lock(&configfs_dirent_lock);
+ list_move(q, p);
+ spin_unlock(&configfs_dirent_lock);
+ p = q;
+ ctx->pos++;
}
return 0;
}
@@ -1661,7 +1643,7 @@ const struct file_operations configfs_dir_operations = {
.release = configfs_dir_close,
.llseek = configfs_dir_lseek,
.read = generic_read_dir,
- .readdir = configfs_readdir,
+ .iterate = configfs_readdir,
};
int configfs_register_subsystem(struct configfs_subsystem *subsys)
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 35b1c7bd18b7..e501ac3a49ff 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -349,18 +349,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
/*
* Read a cramfs directory entry.
*/
-static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int cramfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
char *buf;
unsigned int offset;
- int copied;
/* Offset within the thing. */
- offset = filp->f_pos;
- if (offset >= inode->i_size)
+ if (ctx->pos >= inode->i_size)
return 0;
+ offset = ctx->pos;
/* Directory entries are always 4-byte aligned */
if (offset & 3)
return -EINVAL;
@@ -369,14 +368,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (!buf)
return -ENOMEM;
- copied = 0;
while (offset < inode->i_size) {
struct cramfs_inode *de;
unsigned long nextoffset;
char *name;
ino_t ino;
umode_t mode;
- int namelen, error;
+ int namelen;
mutex_lock(&read_mutex);
de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
@@ -402,13 +400,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
break;
namelen--;
}
- error = filldir(dirent, buf, namelen, offset, ino, mode >> 12);
- if (error)
+ if (!dir_emit(ctx, buf, namelen, ino, mode >> 12))
break;
- offset = nextoffset;
- filp->f_pos = offset;
- copied++;
+ ctx->pos = offset = nextoffset;
}
kfree(buf);
return 0;
@@ -547,7 +542,7 @@ static const struct address_space_operations cramfs_aops = {
static const struct file_operations cramfs_directory_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = cramfs_readdir,
+ .iterate = cramfs_readdir,
};
static const struct inode_operations cramfs_dir_inode_operations = {
diff --git a/fs/dcache.c b/fs/dcache.c
index f09b9085f7d8..5a23073138df 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1612,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias);
* If a dentry was found and moved, then it is returned. Otherwise NULL
* is returned. This matches the expected return value of ->lookup.
*
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
@@ -1636,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
security_d_instantiate(dentry, inode);
d_rehash(dentry);
}
- } else
- d_add(dentry, inode);
+ } else {
+ d_instantiate(dentry, inode);
+ if (d_unhashed(dentry))
+ d_rehash(dentry);
+ }
return new;
}
EXPORT_SYMBOL(d_splice_alias);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index c5ca6ae5a30c..63146295153b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -21,6 +21,7 @@
#include <linux/debugfs.h>
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/atomic.h>
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
+static int debugfs_atomic_t_set(void *data, u64 val)
+{
+ atomic_set((atomic_t *)data, val);
+ return 0;
+}
+static int debugfs_atomic_t_get(void *data, u64 *val)
+{
+ *val = atomic_read((atomic_t *)data);
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+ debugfs_atomic_t_set, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
+
+/**
+ * debugfs_create_atomic_t - create a debugfs file that is used to read and
+ * write an atomic_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ */
+struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
+ struct dentry *parent, atomic_t *value)
+{
+ /* if there are no write bits set, make read only */
+ if (!(mode & S_IWUGO))
+ return debugfs_create_file(name, mode, parent, value,
+ &fops_atomic_t_ro);
+ /* if there are no read bits set, make write only */
+ if (!(mode & S_IRUGO))
+ return debugfs_create_file(name, mode, parent, value,
+ &fops_atomic_t_wo);
+
+ return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
static ssize_t read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
@@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
+ buf[buf_size] = '\0';
if (strtobool(buf, &bv) == 0)
*val = bv;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 7d58d5b112b5..76feb4b60fa6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
const char *buf, size_t len)
{
- strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
- strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
+ strlcpy(dlm_config.ci_cluster_name, buf,
+ sizeof(dlm_config.ci_cluster_name));
+ strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
return len;
}
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1b1146670c4b..e223a911a834 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
if (b == 1) {
int len = receive_extralen(ms);
- if (len > DLM_RESNAME_MAXLEN)
- len = DLM_RESNAME_MAXLEN;
+ if (len > r->res_ls->ls_lvblen)
+ len = r->res_ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
lkb->lkb_lvbseq = ms->m_lvbseq;
}
@@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (!lkb->lkb_lvbptr)
return -ENOMEM;
len = receive_extralen(ms);
- if (len > DLM_RESNAME_MAXLEN)
- len = DLM_RESNAME_MAXLEN;
+ if (len > ls->ls_lvblen)
+ len = ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
}
return 0;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3ca79d3253b9..88556dc0458e 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force)
void dlm_stop_lockspaces(void)
{
struct dlm_ls *ls;
+ int count;
restart:
+ count = 0;
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
- if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+ if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
+ count++;
continue;
+ }
spin_unlock(&lslist_lock);
log_error(ls, "no userland control daemon, stopping lockspace");
dlm_ls_stop(ls);
goto restart;
}
spin_unlock(&lslist_lock);
+
+ if (count)
+ log_print("dlm user daemon left %d lockspaces", count);
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d0ccd2fd79eb..d90909ec6aa6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -52,7 +52,6 @@
#include <linux/mutex.h>
#include <linux/sctp.h>
#include <linux/slab.h>
-#include <linux/sctp.h>
#include <net/sctp/sctp.h>
#include <net/ipv6.h>
@@ -126,6 +125,7 @@ struct connection {
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
+ bool try_new_addr;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -144,6 +144,7 @@ struct dlm_node_addr {
struct list_head list;
int nodeid;
int addr_count;
+ int curr_addr_index;
struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
};
@@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
}
static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
- struct sockaddr *sa_out)
+ struct sockaddr *sa_out, bool try_new_addr)
{
struct sockaddr_storage sas;
struct dlm_node_addr *na;
@@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
spin_lock(&dlm_node_addrs_spin);
na = find_node_addr(nodeid);
- if (na && na->addr_count)
- memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage));
+ if (na && na->addr_count) {
+ if (try_new_addr) {
+ na->curr_addr_index++;
+ if (na->curr_addr_index == na->addr_count)
+ na->curr_addr_index = 0;
+ }
+
+ memcpy(&sas, na->addr[na->curr_addr_index ],
+ sizeof(struct sockaddr_storage));
+ }
spin_unlock(&dlm_node_addrs_spin);
if (!na)
@@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
{
struct dlm_node_addr *na;
int rv = -EEXIST;
+ int addr_i;
spin_lock(&dlm_node_addrs_spin);
list_for_each_entry(na, &dlm_node_addrs, list) {
if (!na->addr_count)
continue;
- if (!addr_compare(na->addr[0], addr))
- continue;
-
- *nodeid = na->nodeid;
- rv = 0;
- break;
+ for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
+ if (addr_compare(na->addr[addr_i], addr)) {
+ *nodeid = na->nodeid;
+ rv = 0;
+ goto unlock;
+ }
+ }
}
+unlock:
spin_unlock(&dlm_node_addrs_spin);
return rv;
}
@@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
static void sctp_init_failed_foreach(struct connection *con)
{
+
+ /*
+ * Don't try to recover base con and handle race where the
+ * other node's assoc init creates a assoc and we get that
+ * notification, then we get a notification that our attempt
+ * failed due. This happens when we are still trying the primary
+ * address, but the other node has already tried secondary addrs
+ * and found one that worked.
+ */
+ if (!con->nodeid || con->sctp_assoc)
+ return;
+
+ log_print("Retrying SCTP association init for node %d\n", con->nodeid);
+
+ con->try_new_addr = true;
con->sctp_assoc = 0;
- if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+ if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -579,15 +606,56 @@ static void sctp_init_failed(void)
mutex_unlock(&connections_lock);
}
+static void retry_failed_sctp_send(struct connection *recv_con,
+ struct sctp_send_failed *sn_send_failed,
+ char *buf)
+{
+ int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
+ struct dlm_mhandle *mh;
+ struct connection *con;
+ char *retry_buf;
+ int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
+
+ log_print("Retry sending %d bytes to node id %d", len, nodeid);
+
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ log_print("Could not look up con for nodeid %d\n",
+ nodeid);
+ return;
+ }
+
+ mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
+ if (!mh) {
+ log_print("Could not allocate buf for retry.");
+ return;
+ }
+ memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
+ dlm_lowcomms_commit_buffer(mh);
+
+ /*
+ * If we got a assoc changed event before the send failed event then
+ * we only need to retry the send.
+ */
+ if (con->sctp_assoc) {
+ if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+ queue_work(send_workqueue, &con->swork);
+ } else
+ sctp_init_failed_foreach(con);
+}
+
/* Something happened to an association */
static void process_sctp_notification(struct connection *con,
struct msghdr *msg, char *buf)
{
union sctp_notification *sn = (union sctp_notification *)buf;
- if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+ switch (sn->sn_header.sn_type) {
+ case SCTP_SEND_FAILED:
+ retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
+ break;
+ case SCTP_ASSOC_CHANGE:
switch (sn->sn_assoc_change.sac_state) {
-
case SCTP_COMM_UP:
case SCTP_RESTART:
{
@@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con,
log_print("connecting to %d sctp association %d",
nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
+ new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
+ new_con->try_new_addr = false;
/* Send any pending writes */
clear_bit(CF_CONNECT_PENDING, &new_con->flags);
- clear_bit(CF_INIT_PENDING, &con->flags);
+ clear_bit(CF_INIT_PENDING, &new_con->flags);
if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
queue_work(send_workqueue, &new_con->swork);
}
@@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con,
}
break;
- /* We don't know which INIT failed, so clear the PENDING flags
- * on them all. if assoc_id is zero then it will then try
- * again */
-
case SCTP_CANT_STR_ASSOC:
{
+ /* Will retry init when we get the send failed notification */
log_print("Can't start SCTP association - retrying");
- sctp_init_failed();
}
break;
@@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con,
(int)sn->sn_assoc_change.sac_assoc_id,
sn->sn_assoc_change.sac_state);
}
+ default:
+ ; /* fall through */
}
}
@@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e)
kfree(e);
}
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */
+static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
+{
+ e->offset += completed;
+ e->len -= completed;
+
+ if (e->len == 0 && e->users == 0) {
+ list_del(&e->list);
+ free_entry(e);
+ }
+}
+
/* Initiate an SCTP association.
This is a special case of send_to_sock() in that we don't yet have a
peeled-off socket for this association, so we use the listening socket
@@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con)
int addrlen;
struct kvec iov[1];
+ mutex_lock(&con->sock_mutex);
if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
- return;
-
- if (con->retries++ > MAX_CONNECT_RETRIES)
- return;
+ goto unlock;
- if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) {
+ if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
+ con->try_new_addr)) {
log_print("no address for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
base_con = nodeid2con(0, 0);
BUG_ON(base_con == NULL);
@@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con)
if (list_empty(&con->writequeue)) {
spin_unlock(&con->writequeue_lock);
log_print("writequeue empty for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
- spin_unlock(&con->writequeue_lock);
/* Send the first block off the write queue */
iov[0].iov_base = page_address(e->page)+offset;
iov[0].iov_len = len;
+ spin_unlock(&con->writequeue_lock);
+
+ if (rem_addr.ss_family == AF_INET) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
+ log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
+ } else {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
+ log_print("Trying to connect to %pI6", &sin6->sin6_addr);
+ }
cmsg = CMSG_FIRSTHDR(&outmessage);
cmsg->cmsg_level = IPPROTO_SCTP;
@@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con)
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
sinfo = CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
- sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
+ sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
outmessage.msg_controllen = cmsg->cmsg_len;
+ sinfo->sinfo_flags |= SCTP_ADDR_OVER;
ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
if (ret < 0) {
@@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con)
}
else {
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- }
+ writequeue_entry_complete(e, ret);
spin_unlock(&con->writequeue_lock);
}
+
+unlock:
+ mutex_unlock(&con->sock_mutex);
}
/* Connect a new socket to its peer */
@@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- result = nodeid_to_addr(con->nodeid, &saddr, NULL);
+ result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
goto out_err;
@@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void)
int result = -EINVAL, num = 1, i, addr_len;
struct connection *con = nodeid2con(0, GFP_NOFS);
int bufsize = NEEDED_RMEM;
+ int one = 1;
if (!con)
return -ENOMEM;
@@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void)
goto create_delsock;
}
+ result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
+ sizeof(one));
+ if (result < 0)
+ log_print("Could not set SCTP NODELAY error %d\n", result);
+
/* Init con struct */
sock->sk->sk_user_data = con;
con->sock = sock;
@@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con)
}
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- }
+ writequeue_entry_complete(e, ret);
}
spin_unlock(&con->writequeue_lock);
out:
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 201f0a0d6b0a..9aa05e08060b 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -68,9 +68,9 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
}
struct ecryptfs_getdents_callback {
- void *dirent;
+ struct dir_context ctx;
+ struct dir_context *caller;
struct dentry *dentry;
- filldir_t filldir;
int filldir_called;
int entries_written;
};
@@ -96,9 +96,10 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
rc);
goto out;
}
- rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
+ buf->caller->pos = buf->ctx.pos;
+ rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
kfree(name);
- if (rc >= 0)
+ if (!rc)
buf->entries_written++;
out:
return rc;
@@ -107,27 +108,23 @@ out:
/**
* ecryptfs_readdir
* @file: The eCryptfs directory file
- * @dirent: Directory entry handle
- * @filldir: The filldir callback function
+ * @ctx: The actor to feed the entries to
*/
-static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
{
int rc;
struct file *lower_file;
struct inode *inode;
- struct ecryptfs_getdents_callback buf;
-
+ struct ecryptfs_getdents_callback buf = {
+ .ctx.actor = ecryptfs_filldir,
+ .caller = ctx,
+ .dentry = file->f_path.dentry
+ };
lower_file = ecryptfs_file_to_lower(file);
- lower_file->f_pos = file->f_pos;
+ lower_file->f_pos = ctx->pos;
inode = file_inode(file);
- memset(&buf, 0, sizeof(buf));
- buf.dirent = dirent;
- buf.dentry = file->f_path.dentry;
- buf.filldir = filldir;
- buf.filldir_called = 0;
- buf.entries_written = 0;
- rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
- file->f_pos = lower_file->f_pos;
+ rc = iterate_dir(lower_file, &buf.ctx);
+ ctx->pos = buf.ctx.pos;
if (rc < 0)
goto out;
if (buf.filldir_called && !buf.entries_written)
@@ -295,6 +292,12 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
static int
ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
+ int rc;
+
+ rc = filemap_write_and_wait(file->f_mapping);
+ if (rc)
+ return rc;
+
return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
}
@@ -338,7 +341,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
#endif
const struct file_operations ecryptfs_dir_fops = {
- .readdir = ecryptfs_readdir,
+ .iterate = ecryptfs_readdir,
.read = generic_read_dir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
@@ -359,7 +362,7 @@ const struct file_operations ecryptfs_main_fops = {
.aio_read = ecryptfs_read_update_atime,
.write = do_sync_write,
.aio_write = generic_file_aio_write,
- .readdir = ecryptfs_readdir,
+ .iterate = ecryptfs_readdir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index bfb531564319..8dd524f32284 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -44,8 +44,11 @@ static ssize_t efivarfs_file_write(struct file *file,
bytes = efivar_entry_set_get_size(var, attributes, &datasize,
data, &set);
- if (!set && bytes)
+ if (!set && bytes) {
+ if (bytes == -ENOENT)
+ bytes = -EIO;
goto out;
+ }
if (bytes == -ENOENT) {
drop_nlink(inode);
@@ -76,7 +79,14 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
int err;
err = efivar_entry_size(var, &datasize);
- if (err)
+
+ /*
+ * efivarfs represents uncommitted variables with
+ * zero-length files. Reading them should return EOF.
+ */
+ if (err == -ENOENT)
+ return 0;
+ else if (err)
return err;
data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL);
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 055a9e9ca747..b72307ccdf7a 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -7,40 +7,38 @@
#include <linux/buffer_head.h>
#include "efs.h"
-static int efs_readdir(struct file *, void *, filldir_t);
+static int efs_readdir(struct file *, struct dir_context *);
const struct file_operations efs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = efs_readdir,
+ .iterate = efs_readdir,
};
const struct inode_operations efs_dir_inode_operations = {
.lookup = efs_lookup,
};
-static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
- struct inode *inode = file_inode(filp);
- struct buffer_head *bh;
-
- struct efs_dir *dirblock;
- struct efs_dentry *dirslot;
- efs_ino_t inodenum;
+static int efs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
efs_block_t block;
- int slot, namelen;
- char *nameptr;
+ int slot;
if (inode->i_size & (EFS_DIRBSIZE-1))
printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
/* work out where this entry can be found */
- block = filp->f_pos >> EFS_DIRBSIZE_BITS;
+ block = ctx->pos >> EFS_DIRBSIZE_BITS;
/* each block contains at most 256 slots */
- slot = filp->f_pos & 0xff;
+ slot = ctx->pos & 0xff;
/* look at all blocks */
while (block < inode->i_blocks) {
+ struct efs_dir *dirblock;
+ struct buffer_head *bh;
+
/* read the dir block */
bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
@@ -57,11 +55,14 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
break;
}
- while (slot < dirblock->slots) {
- if (dirblock->space[slot] == 0) {
- slot++;
+ for (; slot < dirblock->slots; slot++) {
+ struct efs_dentry *dirslot;
+ efs_ino_t inodenum;
+ const char *nameptr;
+ int namelen;
+
+ if (dirblock->space[slot] == 0)
continue;
- }
dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
@@ -72,39 +73,29 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
#ifdef DEBUG
printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen);
#endif
- if (namelen > 0) {
- /* found the next entry */
- filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-
- /* copy filename and data in dirslot */
- filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN);
-
- /* sanity check */
- if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
- printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
- slot++;
- continue;
- }
-
- /* store position of next slot */
- if (++slot == dirblock->slots) {
- slot = 0;
- block++;
- }
+ if (!namelen)
+ continue;
+ /* found the next entry */
+ ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
+
+ /* sanity check */
+ if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
+ printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
+ continue;
+ }
+
+ /* copy filename and data in dirslot */
+ if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) {
brelse(bh);
- filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
- goto out;
+ return 0;
}
- slot++;
}
brelse(bh);
slot = 0;
block++;
}
-
- filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-out:
+ ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
return 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index 643019585574..ffd7a813ad3d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1135,13 +1135,6 @@ void setup_new_exec(struct linux_binprm * bprm)
set_dumpable(current->mm, suid_dumpable);
}
- /*
- * Flush performance counters when crossing a
- * security domain:
- */
- if (!get_dumpable(current->mm))
- perf_event_exit_task(current);
-
/* An exec changes our domain. We are no longer part of the thread
group */
@@ -1205,6 +1198,15 @@ void install_exec_creds(struct linux_binprm *bprm)
commit_creds(bprm->cred);
bprm->cred = NULL;
+
+ /*
+ * Disable monitoring for regular users
+ * when executing setuid binaries. Must
+ * wait until new credentials are committed
+ * by commit_creds() above
+ */
+ if (get_dumpable(current->mm) != SUID_DUMP_USER)
+ perf_event_exit_task(current);
/*
* cred_guard_mutex must be held at least to this point to prevent
* ptrace_attach() from altering our determination of the task's
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 46375896cfc0..49f51ab4caac 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
}
static int
-exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+exofs_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
- unsigned char *types = NULL;
- int need_revalidate = (filp->f_version != inode->i_version);
+ int need_revalidate = (file->f_version != inode->i_version);
if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
return 0;
- types = exofs_filetype_table;
-
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
struct exofs_dir_entry *de;
@@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (IS_ERR(page)) {
EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_CACHE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
@@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (offset) {
offset = exofs_validate_entry(kaddr, offset,
chunk_mask);
- filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
}
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
need_revalidate = 0;
}
de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
return -EIO;
}
if (de->inode_no) {
- int over;
- unsigned char d_type = DT_UNKNOWN;
+ unsigned char t;
- if (types && de->file_type < EXOFS_FT_MAX)
- d_type = types[de->file_type];
+ if (de->file_type < EXOFS_FT_MAX)
+ t = exofs_filetype_table[de->file_type];
+ else
+ t = DT_UNKNOWN;
- offset = (char *)de - kaddr;
- over = filldir(dirent, de->name, de->name_len,
- (n<<PAGE_CACHE_SHIFT) | offset,
+ if (!dir_emit(ctx, de->name, de->name_len,
le64_to_cpu(de->inode_no),
- d_type);
- if (over) {
+ t)) {
exofs_put_page(page);
return 0;
}
}
- filp->f_pos += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
exofs_put_page(page);
}
-
return 0;
}
@@ -669,5 +663,5 @@ not_empty:
const struct file_operations exofs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = exofs_readdir,
+ .iterate = exofs_readdir,
};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd8828..2ec8eb1ab269 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
return 0;
}
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+ EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+ page->index, offset, length);
WARN_ON(1);
}
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 262fc9940982..293bc2e47a73 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -212,6 +212,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
}
struct getdents_callback {
+ struct dir_context ctx;
char *name; /* name that was found. It already points to a
buffer NAME_MAX+1 is size */
unsigned long ino; /* the inum we are looking for */
@@ -254,7 +255,11 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
struct inode *dir = path->dentry->d_inode;
int error;
struct file *file;
- struct getdents_callback buffer;
+ struct getdents_callback buffer = {
+ .ctx.actor = filldir_one,
+ .name = name,
+ .ino = child->d_inode->i_ino
+ };
error = -ENOTDIR;
if (!dir || !S_ISDIR(dir->i_mode))
@@ -271,17 +276,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
goto out;
error = -EINVAL;
- if (!file->f_op->readdir)
+ if (!file->f_op->iterate)
goto out_close;
- buffer.name = name;
- buffer.ino = child->d_inode->i_ino;
- buffer.found = 0;
buffer.sequence = 0;
while (1) {
int old_seq = buffer.sequence;
- error = vfs_readdir(file, filldir_one, &buffer);
+ error = iterate_dir(file, &buffer.ctx);
if (buffer.found) {
error = 0;
break;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 4237722bfd27..6e1d4ab09d72 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -287,17 +287,17 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
}
static int
-ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
+ext2_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
unsigned char *types = NULL;
- int need_revalidate = filp->f_version != inode->i_version;
+ int need_revalidate = file->f_version != inode->i_version;
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
return 0;
@@ -314,16 +314,16 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
ext2_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_CACHE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
if (unlikely(need_revalidate)) {
if (offset) {
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
- filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
}
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
need_revalidate = 0;
}
de = (ext2_dirent *)(kaddr+offset);
@@ -336,22 +336,19 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
return -EIO;
}
if (de->inode) {
- int over;
unsigned char d_type = DT_UNKNOWN;
if (types && de->file_type < EXT2_FT_MAX)
d_type = types[de->file_type];
- offset = (char *)de - kaddr;
- over = filldir(dirent, de->name, de->name_len,
- (n<<PAGE_CACHE_SHIFT) | offset,
- le32_to_cpu(de->inode), d_type);
- if (over) {
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ d_type)) {
ext2_put_page(page);
return 0;
}
}
- filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
+ ctx->pos += ext2_rec_len_from_disk(de->rec_len);
}
ext2_put_page(page);
}
@@ -724,7 +721,7 @@ not_empty:
const struct file_operations ext2_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ext2_readdir,
+ .iterate = ext2_readdir,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 87eccbbca255..f522425aaa24 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -28,8 +28,7 @@ static unsigned char ext3_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int ext3_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir);
+static int ext3_dx_readdir(struct file *, struct dir_context *);
static unsigned char get_dtype(struct super_block *sb, int filetype)
{
@@ -91,36 +90,30 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
return error_msg == NULL ? 1 : 0;
}
-static int ext3_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext3_readdir(struct file *file, struct dir_context *ctx)
{
- int error = 0;
unsigned long offset;
- int i, stored;
+ int i;
struct ext3_dir_entry_2 *de;
int err;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- int ret = 0;
int dir_has_error = 0;
if (is_dx_dir(inode)) {
- err = ext3_dx_readdir(filp, dirent, filldir);
- if (err != ERR_BAD_DX_DIR) {
- ret = err;
- goto out;
- }
+ err = ext3_dx_readdir(file, ctx);
+ if (err != ERR_BAD_DX_DIR)
+ return err;
/*
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL;
+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
}
- stored = 0;
- offset = filp->f_pos & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && filp->f_pos < inode->i_size) {
- unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
+ while (ctx->pos < inode->i_size) {
+ unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
struct buffer_head map_bh;
struct buffer_head *bh = NULL;
@@ -129,12 +122,12 @@ static int ext3_readdir(struct file * filp,
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!ra_has_index(&filp->f_ra, index))
+ if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
- &filp->f_ra, filp,
+ &file->f_ra, file,
index, 1);
- filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
bh = ext3_bread(NULL, inode, blk, 0, &err);
}
@@ -146,22 +139,21 @@ static int ext3_readdir(struct file * filp,
if (!dir_has_error) {
ext3_error(sb, __func__, "directory #%lu "
"contains a hole at offset %lld",
- inode->i_ino, filp->f_pos);
+ inode->i_ino, ctx->pos);
dir_has_error = 1;
}
/* corrupt size? Maybe no more blocks to read */
- if (filp->f_pos > inode->i_blocks << 9)
+ if (ctx->pos > inode->i_blocks << 9)
break;
- filp->f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (filp->f_version != inode->i_version) {
+ if (offset && file->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext3_dir_entry_2 *)
(bh->b_data + i);
@@ -177,53 +169,40 @@ revalidate:
i += ext3_rec_len_from_disk(de->rec_len);
}
offset = i;
- filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
}
- while (!error && filp->f_pos < inode->i_size
+ while (ctx->pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
bh, offset)) {
- /* On error, skip the f_pos to the
+ /* On error, skip the to the
next block. */
- filp->f_pos = (filp->f_pos |
+ ctx->pos = (ctx->pos |
(sb->s_blocksize - 1)) + 1;
- brelse (bh);
- ret = stored;
- goto out;
+ break;
}
offset += ext3_rec_len_from_disk(de->rec_len);
if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = filp->f_version;
-
- error = filldir(dirent, de->name,
- de->name_len,
- filp->f_pos,
- le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored ++;
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type))) {
+ brelse(bh);
+ return 0;
+ }
}
- filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
+ ctx->pos += ext3_rec_len_from_disk(de->rec_len);
}
offset = 0;
brelse (bh);
+ if (ctx->pos < inode->i_size)
+ if (!dir_relax(inode))
+ return 0;
}
-out:
- return ret;
+ return 0;
}
static inline int is_32bit_api(void)
@@ -452,62 +431,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
-static int call_filldir(struct file * filp, void * dirent,
- filldir_t filldir, struct fname *fname)
+static bool call_filldir(struct file *file, struct dir_context *ctx,
+ struct fname *fname)
{
- struct dir_private_info *info = filp->private_data;
- loff_t curr_pos;
- struct inode *inode = file_inode(filp);
- struct super_block * sb;
- int error;
-
- sb = inode->i_sb;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
if (!fname) {
printk("call_filldir: called with null fname?!?\n");
- return 0;
+ return true;
}
- curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+ ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
- error = filldir(dirent, fname->name,
- fname->name_len, curr_pos,
+ if (!dir_emit(ctx, fname->name, fname->name_len,
fname->inode,
- get_dtype(sb, fname->file_type));
- if (error) {
- filp->f_pos = curr_pos;
+ get_dtype(sb, fname->file_type))) {
info->extra_fname = fname;
- return error;
+ return false;
}
fname = fname->next;
}
- return 0;
+ return true;
}
-static int ext3_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
{
- struct dir_private_info *info = filp->private_data;
- struct inode *inode = file_inode(filp);
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
struct fname *fname;
int ret;
if (!info) {
- info = ext3_htree_create_dir_info(filp, filp->f_pos);
+ info = ext3_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
- filp->private_data = info;
+ file->private_data = info;
}
- if (filp->f_pos == ext3_get_htree_eof(filp))
+ if (ctx->pos == ext3_get_htree_eof(file))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
- if (info->last_pos != filp->f_pos) {
+ if (info->last_pos != ctx->pos) {
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp, filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+ info->curr_hash = pos2maj_hash(file, ctx->pos);
+ info->curr_minor_hash = pos2min_hash(file, ctx->pos);
}
/*
@@ -515,7 +486,7 @@ static int ext3_dx_readdir(struct file * filp,
* chain, return them first.
*/
if (info->extra_fname) {
- if (call_filldir(filp, dirent, filldir, info->extra_fname))
+ if (!call_filldir(file, ctx, info->extra_fname))
goto finished;
info->extra_fname = NULL;
goto next_node;
@@ -529,17 +500,17 @@ static int ext3_dx_readdir(struct file * filp,
* cached entries.
*/
if ((!info->curr_node) ||
- (filp->f_version != inode->i_version)) {
+ (file->f_version != inode->i_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- filp->f_version = inode->i_version;
- ret = ext3_htree_fill_tree(filp, info->curr_hash,
+ file->f_version = inode->i_version;
+ ret = ext3_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = ext3_get_htree_eof(filp);
+ ctx->pos = ext3_get_htree_eof(file);
break;
}
info->curr_node = rb_first(&info->root);
@@ -548,7 +519,7 @@ static int ext3_dx_readdir(struct file * filp,
fname = rb_entry(info->curr_node, struct fname, rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
- if (call_filldir(filp, dirent, filldir, fname))
+ if (!call_filldir(file, ctx, fname))
break;
next_node:
info->curr_node = rb_next(info->curr_node);
@@ -559,7 +530,7 @@ static int ext3_dx_readdir(struct file * filp,
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
- filp->f_pos = ext3_get_htree_eof(filp);
+ ctx->pos = ext3_get_htree_eof(file);
break;
}
info->curr_hash = info->next_hash;
@@ -567,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
}
}
finished:
- info->last_pos = filp->f_pos;
+ info->last_pos = ctx->pos;
return 0;
}
@@ -582,7 +553,7 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
const struct file_operations ext3_dir_operations = {
.llseek = ext3_dir_llseek,
.read = generic_read_dir,
- .readdir = ext3_readdir,
+ .iterate = ext3_readdir,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c712825640..f67668f724ba 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
- trace_ext3_invalidatepage(page, offset);
+ trace_ext3_invalidatepage(page, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- journal_invalidatepage(journal, page, offset);
+ journal_invalidatepage(journal, page, offset, length);
}
static int ext3_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 692de13e3596..cea8ecf3e76e 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse (bh);
- return count;
+ /* silently ignore the rest of the block */
+ break;
}
ext3fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..58339393fa6e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
static inline int test_root(ext4_group_t a, int b)
{
- int num = b;
-
- while (a > num)
- num *= b;
- return num == a;
+ while (1) {
+ if (a < b)
+ return 0;
+ if (a == b)
+ return 1;
+ if ((a % b) != 0)
+ return 0;
+ a = a / b;
+ }
}
static int ext4_group_sparse(ext4_group_t group)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f8d56e4254e0..3c7d288ae94c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -29,8 +29,7 @@
#include "ext4.h"
#include "xattr.h"
-static int ext4_dx_readdir(struct file *filp,
- void *dirent, filldir_t filldir);
+static int ext4_dx_readdir(struct file *, struct dir_context *);
/**
* Check if the given dir-inode refers to an htree-indexed directory
@@ -103,60 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
return 1;
}
-static int ext4_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
- int error = 0;
unsigned int offset;
int i, stored;
struct ext4_dir_entry_2 *de;
int err;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- int ret = 0;
int dir_has_error = 0;
if (is_dx_dir(inode)) {
- err = ext4_dx_readdir(filp, dirent, filldir);
+ err = ext4_dx_readdir(file, ctx);
if (err != ERR_BAD_DX_DIR) {
- ret = err;
- goto out;
+ return err;
}
/*
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- ext4_clear_inode_flag(file_inode(filp),
+ ext4_clear_inode_flag(file_inode(file),
EXT4_INODE_INDEX);
}
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
- ret = ext4_read_inline_dir(filp, dirent, filldir,
+ int ret = ext4_read_inline_dir(file, ctx,
&has_inline_data);
if (has_inline_data)
return ret;
}
stored = 0;
- offset = filp->f_pos & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && filp->f_pos < inode->i_size) {
+ while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
struct buffer_head *bh = NULL;
- map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+ map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
pgoff_t index = map.m_pblk >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!ra_has_index(&filp->f_ra, index))
+ if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
- &filp->f_ra, filp,
+ &file->f_ra, file,
index, 1);
- filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
}
@@ -166,16 +161,16 @@ static int ext4_readdir(struct file *filp,
*/
if (!bh) {
if (!dir_has_error) {
- EXT4_ERROR_FILE(filp, 0,
+ EXT4_ERROR_FILE(file, 0,
"directory contains a "
"hole at offset %llu",
- (unsigned long long) filp->f_pos);
+ (unsigned long long) ctx->pos);
dir_has_error = 1;
}
/* corrupt size? Maybe no more blocks to read */
- if (filp->f_pos > inode->i_blocks << 9)
+ if (ctx->pos > inode->i_blocks << 9)
break;
- filp->f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
@@ -183,21 +178,20 @@ static int ext4_readdir(struct file *filp,
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(inode,
(struct ext4_dir_entry *)bh->b_data)) {
- EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
+ EXT4_ERROR_FILE(file, 0, "directory fails checksum "
"at offset %llu",
- (unsigned long long)filp->f_pos);
- filp->f_pos += sb->s_blocksize - offset;
+ (unsigned long long)ctx->pos);
+ ctx->pos += sb->s_blocksize - offset;
brelse(bh);
continue;
}
set_buffer_verified(bh);
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (filp->f_version != inode->i_version) {
+ if (file->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
@@ -214,57 +208,46 @@ revalidate:
sb->s_blocksize);
}
offset = i;
- filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
}
- while (!error && filp->f_pos < inode->i_size
+ while (ctx->pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
- if (ext4_check_dir_entry(inode, filp, de, bh,
+ if (ext4_check_dir_entry(inode, file, de, bh,
bh->b_data, bh->b_size,
offset)) {
/*
- * On error, skip the f_pos to the next block
+ * On error, skip to the next block
*/
- filp->f_pos = (filp->f_pos |
+ ctx->pos = (ctx->pos |
(sb->s_blocksize - 1)) + 1;
- brelse(bh);
- ret = stored;
- goto out;
+ break;
}
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = filp->f_version;
-
- error = filldir(dirent, de->name,
+ if (!dir_emit(ctx, de->name,
de->name_len,
- filp->f_pos,
le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored++;
+ get_dtype(sb, de->file_type))) {
+ brelse(bh);
+ return 0;
+ }
}
- filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+ ctx->pos += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
}
offset = 0;
brelse(bh);
+ if (ctx->pos < inode->i_size) {
+ if (!dir_relax(inode))
+ return 0;
+ }
}
-out:
- return ret;
+ return 0;
}
static inline int is_32bit_api(void)
@@ -492,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
-static int call_filldir(struct file *filp, void *dirent,
- filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+ struct fname *fname)
{
- struct dir_private_info *info = filp->private_data;
- loff_t curr_pos;
- struct inode *inode = file_inode(filp);
- struct super_block *sb;
- int error;
-
- sb = inode->i_sb;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
if (!fname) {
ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
@@ -509,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent,
inode->i_ino, current->comm);
return 0;
}
- curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+ ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
- error = filldir(dirent, fname->name,
- fname->name_len, curr_pos,
+ if (!dir_emit(ctx, fname->name,
+ fname->name_len,
fname->inode,
- get_dtype(sb, fname->file_type));
- if (error) {
- filp->f_pos = curr_pos;
+ get_dtype(sb, fname->file_type))) {
info->extra_fname = fname;
- return error;
+ return 1;
}
fname = fname->next;
}
return 0;
}
-static int ext4_dx_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
{
- struct dir_private_info *info = filp->private_data;
- struct inode *inode = file_inode(filp);
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
struct fname *fname;
int ret;
if (!info) {
- info = ext4_htree_create_dir_info(filp, filp->f_pos);
+ info = ext4_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
- filp->private_data = info;
+ file->private_data = info;
}
- if (filp->f_pos == ext4_get_htree_eof(filp))
+ if (ctx->pos == ext4_get_htree_eof(file))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
- if (info->last_pos != filp->f_pos) {
+ if (info->last_pos != ctx->pos) {
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp, filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+ info->curr_hash = pos2maj_hash(file, ctx->pos);
+ info->curr_minor_hash = pos2min_hash(file, ctx->pos);
}
/*
@@ -557,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp,
* chain, return them first.
*/
if (info->extra_fname) {
- if (call_filldir(filp, dirent, filldir, info->extra_fname))
+ if (call_filldir(file, ctx, info->extra_fname))
goto finished;
info->extra_fname = NULL;
goto next_node;
@@ -571,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp,
* cached entries.
*/
if ((!info->curr_node) ||
- (filp->f_version != inode->i_version)) {
+ (file->f_version != inode->i_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- filp->f_version = inode->i_version;
- ret = ext4_htree_fill_tree(filp, info->curr_hash,
+ file->f_version = inode->i_version;
+ ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = ext4_get_htree_eof(filp);
+ ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_node = rb_first(&info->root);
@@ -590,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp,
fname = rb_entry(info->curr_node, struct fname, rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
- if (call_filldir(filp, dirent, filldir, fname))
+ if (call_filldir(file, ctx, fname))
break;
next_node:
info->curr_node = rb_next(info->curr_node);
@@ -601,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp,
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
- filp->f_pos = ext4_get_htree_eof(filp);
+ ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_hash = info->next_hash;
@@ -609,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp,
}
}
finished:
- info->last_pos = filp->f_pos;
+ info->last_pos = ctx->pos;
return 0;
}
@@ -624,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
const struct file_operations ext4_dir_operations = {
.llseek = ext4_dir_llseek,
.read = generic_read_dir,
- .readdir = ext4_readdir,
+ .iterate = ext4_readdir,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0aabb344b02e..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,33 +177,22 @@ struct ext4_map_blocks {
};
/*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
-/*
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
-#define EXT4_IO_END_ERROR 0x0002
-#define EXT4_IO_END_DIRECT 0x0004
+#define EXT4_IO_END_DIRECT 0x0002
/*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
*/
typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
+ handle_t *handle; /* handle reserved for extent
+ * conversion */
struct inode *inode; /* file being written to */
+ struct bio *bio; /* Linked list of completed
+ * bios covering the extent */
unsigned int flag; /* unwritten or not */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
@@ -582,11 +571,6 @@ enum {
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
/*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001
-
-/*
* ioctl commands
*/
#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
@@ -880,6 +864,7 @@ struct ext4_inode_info {
rwlock_t i_es_lock;
struct list_head i_es_lru;
unsigned int i_es_lru_nr; /* protected by i_es_lock */
+ unsigned long i_touch_when; /* jiffies of last accessing */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -904,12 +889,22 @@ struct ext4_inode_info {
qsize_t i_reserved_quota;
#endif
- /* completed IOs that might need unwritten extents handling */
- struct list_head i_completed_io_list;
+ /* Lock protecting lists below */
spinlock_t i_completed_io_lock;
+ /*
+ * Completed IOs that need unwritten extents handling and have
+ * transaction reserved
+ */
+ struct list_head i_rsv_conversion_list;
+ /*
+ * Completed IOs that need unwritten extents handling and don't have
+ * transaction reserved
+ */
+ struct list_head i_unrsv_conversion_list;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
- struct work_struct i_unwritten_work; /* deferred extent conversion */
+ struct work_struct i_rsv_conversion_work;
+ struct work_struct i_unrsv_conversion_work;
spinlock_t i_block_reservation_lock;
@@ -1246,7 +1241,6 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
- unsigned int s_max_writeback_mb_bump;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
@@ -1282,8 +1276,10 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
- /* workqueue for dio unwritten */
- struct workqueue_struct *dio_unwritten_wq;
+ /* workqueue for unreserved extent convertions (dio) */
+ struct workqueue_struct *unrsv_conversion_wq;
+ /* workqueue for reserved extent conversions (buffered io) */
+ struct workqueue_struct *rsv_conversion_wq;
/* timer for periodic error stats printing */
struct timer_list s_err_report;
@@ -1308,6 +1304,7 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
struct list_head s_es_lru;
+ unsigned long s_es_last_sorted;
struct percpu_counter s_extent_cache_cnt;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
};
@@ -1343,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ /* Writeback has to have coversion transaction reserved */
+ WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+ !(io_end->flag & EXT4_IO_END_DIRECT));
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
@@ -2000,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2089,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2097,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
- struct address_space *mapping, loff_t from,
- loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2112,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t first, ext4_lblk_t stop);
@@ -2167,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
char nbuf[16]);
+
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
- __LINE__, ## message)
extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
extern __printf(4, 5)
void __ext4_abort(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
- __LINE__, ## message)
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
- __LINE__, ## message)
extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
- __LINE__, msg)
extern __printf(7, 8)
void __ext4_grp_locked_error(const char *, unsigned int,
struct super_block *, ext4_group_t,
unsigned long, ext4_fsblk_t,
const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
- __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+ __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...) \
+ __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...) \
+ __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...) \
+ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...) \
+ __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+ fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_inode(inode, "", 0, block, " "); \
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_file(file, "", 0, block, " "); \
+} while (0)
+#define ext4_error(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error(sb, "", 0, " "); \
+} while (0)
+#define ext4_abort(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_abort(sb, "", 0, " "); \
+} while (0)
+#define ext4_warning(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_warning(sb, "", 0, " "); \
+} while (0)
+#define ext4_msg(sb, level, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_msg(sb, "", " "); \
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \
+} while (0)
+
+#endif
+
extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
@@ -2313,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
{
struct ext4_group_info ***grp_info;
long indexv, indexh;
+ BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
grp_info = EXT4_SB(sb)->s_group_info;
indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2516,7 +2573,7 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
struct inode *parent,
struct inode *inode);
extern int ext4_read_inline_dir(struct file *filp,
- void *dirent, filldir_t filldir,
+ struct dir_context *ctx,
int *has_inline_data);
extern int htree_inlinedir_to_tree(struct file *dir_file,
struct inode *dir, ext4_lblk_t block,
@@ -2599,8 +2656,7 @@ struct ext4_extent;
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
- int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2610,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2652,14 +2708,14 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
extern int ext4_put_io_end(ext4_io_end_t *io_end);
extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc);
-extern void ext4_end_io_work(struct work_struct *work);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
@@ -2672,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
extern int ext4_mmp_csum_verify(struct super_block *sb,
struct mmp_struct *mmp);
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
- = BH_JBDPrivateStart,
+ = BH_JBDPrivateStart,
BH_AllocFromCluster, /* allocated blocks were part of already
- * allocated cluster. Note that this flag will
- * never, ever appear in a buffer_head's state
- * flag. See EXT4_MAP_FROM_CLUSTER to see where
- * this is used. */
+ * allocated cluster. */
};
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
/*
* Wrappers for jbd2_journal_start/end.
*/
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
{
journal_t *journal;
might_sleep();
-
- trace_ext4_journal_start(sb, nblocks, _RET_IP_);
if (sb->s_flags & MS_RDONLY)
- return ERR_PTR(-EROFS);
-
+ return -EROFS;
WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
- if (!journal)
- return ext4_get_nojournal();
/*
* Special case here: if the journal has aborted behind our
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly.
*/
- if (is_journal_aborted(journal)) {
+ if (journal && is_journal_aborted(journal)) {
ext4_abort(sb, "Detected aborted journal");
- return ERR_PTR(-EROFS);
+ return -EROFS;
}
- return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+ return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int blocks, int rsv_blocks)
+{
+ journal_t *journal;
+ int err;
+
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ journal = EXT4_SB(sb)->s_journal;
+ if (!journal)
+ return ext4_get_nojournal();
+ return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+ type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
return err;
}
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type)
+{
+ struct super_block *sb;
+ int err;
+
+ if (!ext4_handle_valid(handle))
+ return ext4_get_nojournal();
+
+ sb = handle->h_journal->j_private;
+ trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+ _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0) {
+ jbd2_journal_free_reserved(handle);
+ return ERR_PTR(err);
+ }
+
+ err = jbd2_journal_start_reserved(handle, type, line);
+ if (err < 0)
+ return ERR_PTR(err);
+ return handle;
+}
+
void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn, struct buffer_head *bh,
handle_t *handle, int err)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
#define EXT4_HT_MIGRATE 8
#define EXT4_HT_MOVE_EXTENTS 9
#define EXT4_HT_XATTR 10
-#define EXT4_HT_MAX 11
+#define EXT4_HT_EXT_CONVERT 11
+#define EXT4_HT_MAX 12
/**
* struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int nblocks);
+ int type, int blocks, int rsv_blocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
#define ext4_journal_start(inode, type, nblocks) \
- __ext4_journal_start((inode), __LINE__, (type), (nblocks))
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
- int nblocks)
+ int blocks, int rsv_blocks)
{
- return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+ return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ rsv_blocks);
}
#define ext4_journal_stop(handle) \
__ext4_journal_stop(__func__, __LINE__, (handle))
+#define ext4_journal_start_reserved(handle, type) \
+ __ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ jbd2_journal_free_reserved(handle);
+}
+
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 107936db244e..7097b0f680e6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
next_del = ext4_find_delayed_extent(inode, &es);
if (!exists && next_del) {
exists = 1;
- flags |= FIEMAP_EXTENT_DELALLOC;
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
}
up_read(&EXT4_I(inode)->i_data_sem);
@@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
}
/*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
*
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
*
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
*/
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
int depth;
@@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
depth = ext_depth(inode);
- if (chunk)
+ if (extents <= 1)
index = depth * 2;
else
index = depth * 3;
@@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
return index;
}
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ else if (ext4_should_journal_data(inode))
+ return EXT4_FREE_BLOCKS_FORGET;
+ return 0;
+}
+
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent *ex,
- ext4_fsblk_t *partial_cluster,
+ long long *partial_cluster,
ext4_lblk_t from, ext4_lblk_t to)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
- int flags = 0;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
- else if (ext4_should_journal_data(inode))
- flags |= EXT4_FREE_BLOCKS_FORGET;
+ int flags = get_default_free_blocks_flags(inode);
/*
* For bigalloc file systems, we never free a partial cluster
@@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* partial cluster here.
*/
pblk = ext4_ext_pblock(ex) + ee_len - 1;
- if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ if ((*partial_cluster > 0) &&
+ (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
sbi->s_cluster_ratio, flags);
@@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
+ unsigned int unaligned;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
pblk = ext4_ext_pblock(ex) + ee_len - num;
- ext_debug("free last %u blocks starting %llu\n", num, pblk);
+ /*
+ * Usually we want to free partial cluster at the end of the
+ * extent, except for the situation when the cluster is still
+ * used by any other extent (partial_cluster is negative).
+ */
+ if (*partial_cluster < 0 &&
+ -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+ flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+ ext_debug("free last %u blocks starting %llu partial %lld\n",
+ num, pblk, *partial_cluster);
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
/*
* If the block range to be freed didn't start at the
* beginning of a cluster, and we removed the entire
- * extent, save the partial cluster here, since we
- * might need to delete if we determine that the
- * truncate operation has removed all of the blocks in
- * the cluster.
+ * extent and the cluster is not used by any other extent,
+ * save the partial cluster here, since we might need to
+ * delete if we determine that the truncate operation has
+ * removed all of the blocks in the cluster.
+ *
+ * On the other hand, if we did not manage to free the whole
+ * extent, we have to mark the cluster as used (store negative
+ * cluster number in partial_cluster).
*/
- if (pblk & (sbi->s_cluster_ratio - 1) &&
- (ee_len == num))
+ unaligned = pblk & (sbi->s_cluster_ratio - 1);
+ if (unaligned && (ee_len == num) &&
+ (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
*partial_cluster = EXT4_B2C(sbi, pblk);
- else
+ else if (unaligned)
+ *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+ else if (*partial_cluster > 0)
*partial_cluster = 0;
- } else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
- /* head removal */
- ext4_lblk_t num;
- ext4_fsblk_t start;
-
- num = to - from;
- start = ext4_ext_pblock(ex);
-
- ext_debug("free first %u blocks starting %llu\n", num, start);
- ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
- } else {
- printk(KERN_INFO "strange request: removal(2) "
- "%u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
- }
+ } else
+ ext4_error(sbi->s_sb, "strange request: removal(2) "
+ "%u-%u from %u:%u\n",
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
return 0;
}
@@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* @handle: The journal handle
* @inode: The files inode
* @path: The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ * has been released from it. It gets negative in case
+ * that the cluster is still used.
* @start: The first block to remove
* @end: The last block to remove
*/
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+ struct ext4_ext_path *path,
+ long long *partial_cluster,
ext4_lblk_t start, ext4_lblk_t end)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unsigned short ex_ee_len;
unsigned uninitialized = 0;
struct ext4_extent *ex;
+ ext4_fsblk_t pblk;
/* the header must be checked already in ext4_ext_remove_space() */
ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
return -EIO;
}
/* find where to start removing */
- ex = EXT_LAST_EXTENT(eh);
+ ex = path[depth].p_ext;
+ if (!ex)
+ ex = EXT_LAST_EXTENT(eh);
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
/* If this extent is beyond the end of the hole, skip it */
if (end < ex_ee_block) {
+ /*
+ * We're going to skip this extent and move to another,
+ * so if this extent is not cluster aligned we have
+ * to mark the current cluster as used to avoid
+ * accidentally freeing it later on
+ */
+ pblk = ext4_ext_pblock(ex);
+ if (pblk & (sbi->s_cluster_ratio - 1))
+ *partial_cluster =
+ -((long long)EXT4_B2C(sbi, pblk));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
sizeof(struct ext4_extent));
}
le16_add_cpu(&eh->eh_entries, -1);
- } else
+ } else if (*partial_cluster > 0)
*partial_cluster = 0;
err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
err = ext4_ext_correct_indexes(handle, inode, path);
/*
- * If there is still a entry in the leaf node, check to see if
- * it references the partial cluster. This is the only place
- * where it could; if it doesn't, we can free the cluster.
+ * Free the partial cluster only if the current extent does not
+ * reference it. Otherwise we might free used cluster.
*/
- if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+ if (*partial_cluster > 0 &&
(EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
*partial_cluster)) {
- int flags = EXT4_FREE_BLOCKS_FORGET;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ int flags = get_default_free_blocks_flags(inode);
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
@@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
struct ext4_ext_path *path = NULL;
- ext4_fsblk_t partial_cluster = 0;
+ long long partial_cluster = 0;
handle_t *handle;
int i = 0, err = 0;
@@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
return PTR_ERR(handle);
again:
- trace_ext4_ext_remove_space(inode, start, depth);
+ trace_ext4_ext_remove_space(inode, start, end, depth);
/*
* Check if we are removing extents inside the extent tree. If that
@@ -2844,17 +2866,14 @@ again:
}
}
- trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
- path->p_hdr->eh_entries);
+ trace_ext4_ext_remove_space_done(inode, start, end, depth,
+ partial_cluster, path->p_hdr->eh_entries);
/* If we still have something in the partial cluster and we have removed
* even the first extent, then we should free the blocks in the partial
* cluster as well. */
- if (partial_cluster && path->p_hdr->eh_entries == 0) {
- int flags = EXT4_FREE_BLOCKS_FORGET;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+ int flags = get_default_free_blocks_flags(inode);
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -3642,7 +3661,7 @@ int ext4_find_delalloc_range(struct inode *inode,
{
struct extent_status es;
- ext4_es_find_delayed_extent(inode, lblk_start, &es);
+ ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
if (es.es_len == 0)
return 0; /* there is no delay extent in this tree */
else if (es.es_lblk <= lblk_start &&
@@ -4363,7 +4382,7 @@ out2:
}
out3:
- trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+ trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
return err ? err : allocated;
}
@@ -4446,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return ext4_punch_hole(file, offset, len);
+ return ext4_punch_hole(inode, offset, len);
ret = ext4_convert_inline_data(inode);
if (ret)
@@ -4548,10 +4567,9 @@ retry:
* function, to convert the fallocated extents after IO is completed.
* Returns 0 on success.
*/
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
{
- handle_t *handle;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
@@ -4566,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
map.m_lblk);
/*
- * credits to insert 1 extent into extent tree
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
*/
- credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ credits = 0;
+ } else {
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ }
while (ret >= 0 && ret < max_blocks) {
map.m_lblk += ret;
map.m_len = (max_blocks -= ret);
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- break;
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
}
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
inode->i_ino, map.m_lblk,
map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- if (ret <= 0 || ret2 )
+ if (credits)
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2)
break;
}
+ if (!credits)
+ ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
@@ -4608,9 +4645,10 @@ static int ext4_find_delayed_extent(struct inode *inode,
struct extent_status es;
ext4_lblk_t block, next_del;
- ext4_es_find_delayed_extent(inode, newes->es_lblk, &es);
-
if (newes->es_pblk == 0) {
+ ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
+ newes->es_lblk + newes->es_len - 1, &es);
+
/*
* No extent in extent-tree contains block @newes->es_pblk,
* then the block may stay in 1)a hole or 2)delayed-extent.
@@ -4630,7 +4668,7 @@ static int ext4_find_delayed_extent(struct inode *inode,
}
block = newes->es_lblk + newes->es_len;
- ext4_es_find_delayed_extent(inode, block, &es);
+ ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
if (es.es_len == 0)
next_del = EXT_MAX_BLOCKS;
else
@@ -4658,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
error = ext4_get_inode_loc(inode, &iloc);
if (error)
return error;
- physical = iloc.bh->b_blocknr << blockbits;
+ physical = (__u64)iloc.bh->b_blocknr << blockbits;
offset = EXT4_GOOD_OLD_INODE_SIZE +
EXT4_I(inode)->i_extra_isize;
physical += offset;
@@ -4666,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
flags |= FIEMAP_EXTENT_DATA_INLINE;
brelse(iloc.bh);
} else { /* external block */
- physical = EXT4_I(inode)->i_file_acl << blockbits;
+ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index fe3337a85ede..ee018d5f397e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
* Ext4 extents status tree core functions.
*/
#include <linux/rbtree.h>
+#include <linux/list_sort.h>
#include "ext4.h"
#include "extents_status.h"
#include "ext4_extents.h"
@@ -232,14 +233,16 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
}
/*
- * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk
- * if it exists, otherwise, the next extent after @es->lblk.
+ * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
+ * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
*
* @inode: the inode which owns delayed extents
* @lblk: the offset where we start to search
+ * @end: the offset where we stop to search
* @es: delayed extent that we found
*/
-void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+void ext4_es_find_delayed_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
struct ext4_es_tree *tree = NULL;
@@ -247,7 +250,8 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
struct rb_node *node;
BUG_ON(es == NULL);
- trace_ext4_es_find_delayed_extent_enter(inode, lblk);
+ BUG_ON(end < lblk);
+ trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
@@ -270,6 +274,10 @@ out:
if (es1 && !ext4_es_is_delayed(es1)) {
while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node);
+ if (es1->es_lblk > end) {
+ es1 = NULL;
+ break;
+ }
if (ext4_es_is_delayed(es1))
break;
}
@@ -284,8 +292,7 @@ out:
read_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
- trace_ext4_es_find_delayed_extent_exit(inode, es);
+ trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}
static struct extent_status *
@@ -665,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
ext4_es_print_tree(inode);
return err;
@@ -727,7 +733,6 @@ out:
read_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
trace_ext4_es_lookup_extent_exit(inode, es, found);
return found;
}
@@ -871,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
EXTENT_STATUS_WRITTEN);
}
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct ext4_inode_info *eia, *eib;
+ eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+ eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+ if (eia->i_touch_when == eib->i_touch_when)
+ return 0;
+ if (time_after(eia->i_touch_when, eib->i_touch_when))
+ return 1;
+ else
+ return -1;
+}
+
static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
struct ext4_sb_info *sbi = container_of(shrink,
struct ext4_sb_info, s_es_shrinker);
struct ext4_inode_info *ei;
- struct list_head *cur, *tmp, scanned;
+ struct list_head *cur, *tmp;
+ LIST_HEAD(skiped);
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk = 0;
@@ -886,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
if (!nr_to_scan)
return ret;
- INIT_LIST_HEAD(&scanned);
-
spin_lock(&sbi->s_es_lru_lock);
+
+ /*
+ * If the inode that is at the head of LRU list is newer than
+ * last_sorted time, that means that we need to sort this list.
+ */
+ ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+ if (sbi->s_es_last_sorted < ei->i_touch_when) {
+ list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+ sbi->s_es_last_sorted = jiffies;
+ }
+
list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
- list_move_tail(cur, &scanned);
+ /*
+ * If we have already reclaimed all extents from extent
+ * status tree, just stop the loop immediately.
+ */
+ if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+ break;
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
- read_lock(&ei->i_es_lock);
- if (ei->i_es_lru_nr == 0) {
- read_unlock(&ei->i_es_lock);
+ /* Skip the inode that is newer than the last_sorted time */
+ if (sbi->s_es_last_sorted < ei->i_touch_when) {
+ list_move_tail(cur, &skiped);
continue;
}
- read_unlock(&ei->i_es_lock);
+
+ if (ei->i_es_lru_nr == 0)
+ continue;
write_lock(&ei->i_es_lock);
ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ if (ei->i_es_lru_nr == 0)
+ list_del_init(&ei->i_es_lru);
write_unlock(&ei->i_es_lock);
nr_shrunk += ret;
@@ -910,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
if (nr_to_scan == 0)
break;
}
- list_splice_tail(&scanned, &sbi->s_es_lru);
+
+ /* Move the newer inodes into the tail of the LRU list. */
+ list_splice_tail(&skiped, &sbi->s_es_lru);
spin_unlock(&sbi->s_es_lru_lock);
ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
@@ -918,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
return ret;
}
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
- struct ext4_sb_info *sbi;
-
- sbi = EXT4_SB(sb);
INIT_LIST_HEAD(&sbi->s_es_lru);
spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_last_sorted = 0;
sbi->s_es_shrinker.shrink = ext4_es_shrink;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
register_shrinker(&sbi->s_es_shrinker);
}
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
- unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+ unregister_shrinker(&sbi->s_es_shrinker);
}
void ext4_es_lru_add(struct inode *inode)
@@ -940,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode)
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ei->i_touch_when = jiffies;
+
+ if (!list_empty(&ei->i_es_lru))
+ return;
+
spin_lock(&sbi->s_es_lru_lock);
if (list_empty(&ei->i_es_lru))
list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
- else
- list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
spin_unlock(&sbi->s_es_lru_lock);
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index d8e2d4dc311e..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
EXTENT_STATUS_DELAYED | \
EXTENT_STATUS_HOLE)
+struct ext4_sb_info;
struct ext4_extent;
struct extent_status {
@@ -62,7 +63,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
unsigned long long status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
-extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+extern void ext4_es_find_delayed_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es);
@@ -118,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
es->es_pblk = block;
}
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4959e29573b6..b19f0a457f32 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (map->m_lblk + map->m_len) << blkbits;
+ endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
@@ -465,10 +465,10 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* it will be as a data.
*/
- ext4_es_find_delayed_extent(inode, last, &es);
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
}
last++;
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
} while (last <= end);
mutex_unlock(&inode->i_mutex);
@@ -540,7 +540,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
@@ -548,10 +548,10 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* we will skip this extent.
*/
- ext4_es_find_delayed_extent(inode, last, &es);
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
last = es.es_lblk + es.es_len;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
@@ -566,7 +566,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
&map, &holeoff);
if (!unwritten) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a408def..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
return ret;
}
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode: inode to sync
- * @datasync: only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking. This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly. The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
- int err;
- int ret;
-
- ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY))
- return ret;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- return ret;
-
- err = sync_inode_metadata(inode, 1);
- if (ret == 0)
- ret = err;
- return ret;
-}
-
/*
* akpm: A new design for ext4_sync_file().
*
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret, err;
+ int ret = 0, err;
tid_t commit_tid;
bool needs_barrier = false;
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_ext4_sync_file_enter(file, datasync);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
- mutex_lock(&inode->i_mutex);
-
- if (inode->i_sb->s_flags & MS_RDONLY)
- goto out;
-
- ret = ext4_flush_unwritten_io(inode);
- if (ret < 0)
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated s_mount_flags value */
+ smp_rmb();
+ if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ret = -EROFS;
goto out;
+ }
if (!journal) {
- ret = __sync_inode(inode, datasync);
+ ret = generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
}
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!ret)
ret = err;
}
- out:
- mutex_unlock(&inode->i_mutex);
+out:
trace_ext4_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d67b54..f03598c6ffd3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -747,7 +747,8 @@ repeat_in_this_group:
if (!handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks);
+ handle_type, nblocks,
+ 0);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_std_error(sb, err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d351e24f..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -624,7 +624,7 @@ cleanup:
partial--;
}
out:
- trace_ext4_ind_map_blocks_exit(inode, map, err);
+ trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
return err;
}
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
- mutex_lock(&inode->i_mutex);
- ext4_flush_unwritten_io(inode);
- mutex_unlock(&inode->i_mutex);
- }
/*
* Nolock dioread optimization may be dynamically disabled
* via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
{
- int indirects;
-
- /* if nrblocks are contiguous */
- if (chunk) {
- /*
- * With N contiguous data blocks, we need at most
- * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
- * 2 dindirect blocks, and 1 tindirect block
- */
- return DIV_ROUND_UP(nrblocks,
- EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
- }
/*
- * if nrblocks are not contiguous, worse case, each block touch
- * a indirect block, and each indirect block touch a double indirect
- * block, plus a triple indirect block
+ * With N contiguous data blocks, we need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block
*/
- indirects = nrblocks * 2 + 1;
- return indirects;
+ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
/*
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
__le32 *last)
{
__le32 *p;
- int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+ int flags = EXT4_FREE_BLOCKS_VALIDATED;
int err;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+ else if (ext4_should_journal_data(inode))
+ flags |= EXT4_FREE_BLOCKS_FORGET;
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
count)) {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3e2bf873e8a8..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
entry = (struct ext4_xattr_entry *)
((void *)raw_inode + EXT4_I(inode)->i_inline_off);
- free += le32_to_cpu(entry->e_value_size);
+ free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
goto out;
}
@@ -1404,16 +1404,15 @@ out:
* offset as if '.' and '..' really take place.
*
*/
-int ext4_read_inline_dir(struct file *filp,
- void *dirent, filldir_t filldir,
+int ext4_read_inline_dir(struct file *file,
+ struct dir_context *ctx,
int *has_inline_data)
{
- int error = 0;
unsigned int offset, parent_ino;
- int i, stored;
+ int i;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
int ret, inline_size = 0;
struct ext4_iloc iloc;
void *dir_buf = NULL;
@@ -1444,9 +1443,8 @@ int ext4_read_inline_dir(struct file *filp,
goto out;
sb = inode->i_sb;
- stored = 0;
parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
- offset = filp->f_pos;
+ offset = ctx->pos;
/*
* dotdot_offset and dotdot_size is the real offset and
@@ -1460,104 +1458,74 @@ int ext4_read_inline_dir(struct file *filp,
extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
extra_size = extra_offset + inline_size;
- while (!error && !stored && filp->f_pos < extra_size) {
-revalidate:
- /*
- * If the version has changed since the last call to
- * readdir(2), then we might be pointing to an invalid
- * dirent right now. Scan from the start of the inline
- * dir to make sure.
- */
- if (filp->f_version != inode->i_version) {
- for (i = 0; i < extra_size && i < offset;) {
- /*
- * "." is with offset 0 and
- * ".." is dotdot_offset.
- */
- if (!i) {
- i = dotdot_offset;
- continue;
- } else if (i == dotdot_offset) {
- i = dotdot_size;
- continue;
- }
- /* for other entry, the real offset in
- * the buf has to be tuned accordingly.
- */
- de = (struct ext4_dir_entry_2 *)
- (dir_buf + i - extra_offset);
- /* It's too expensive to do a full
- * dirent test each time round this
- * loop, but we do have to test at
- * least that it is non-zero. A
- * failure will be detected in the
- * dirent test below. */
- if (ext4_rec_len_from_disk(de->rec_len,
- extra_size) < EXT4_DIR_REC_LEN(1))
- break;
- i += ext4_rec_len_from_disk(de->rec_len,
- extra_size);
- }
- offset = i;
- filp->f_pos = offset;
- filp->f_version = inode->i_version;
- }
-
- while (!error && filp->f_pos < extra_size) {
- if (filp->f_pos == 0) {
- error = filldir(dirent, ".", 1, 0, inode->i_ino,
- DT_DIR);
- if (error)
- break;
- stored++;
- filp->f_pos = dotdot_offset;
+ /*
+ * If the version has changed since the last call to
+ * readdir(2), then we might be pointing to an invalid
+ * dirent right now. Scan from the start of the inline
+ * dir to make sure.
+ */
+ if (file->f_version != inode->i_version) {
+ for (i = 0; i < extra_size && i < offset;) {
+ /*
+ * "." is with offset 0 and
+ * ".." is dotdot_offset.
+ */
+ if (!i) {
+ i = dotdot_offset;
+ continue;
+ } else if (i == dotdot_offset) {
+ i = dotdot_size;
continue;
}
+ /* for other entry, the real offset in
+ * the buf has to be tuned accordingly.
+ */
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + i - extra_offset);
+ /* It's too expensive to do a full
+ * dirent test each time round this
+ * loop, but we do have to test at
+ * least that it is non-zero. A
+ * failure will be detected in the
+ * dirent test below. */
+ if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+ < EXT4_DIR_REC_LEN(1))
+ break;
+ i += ext4_rec_len_from_disk(de->rec_len,
+ extra_size);
+ }
+ offset = i;
+ ctx->pos = offset;
+ file->f_version = inode->i_version;
+ }
- if (filp->f_pos == dotdot_offset) {
- error = filldir(dirent, "..", 2,
- dotdot_offset,
- parent_ino, DT_DIR);
- if (error)
- break;
- stored++;
+ while (ctx->pos < extra_size) {
+ if (ctx->pos == 0) {
+ if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_offset;
+ continue;
+ }
- filp->f_pos = dotdot_size;
- continue;
- }
+ if (ctx->pos == dotdot_offset) {
+ if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_size;
+ continue;
+ }
- de = (struct ext4_dir_entry_2 *)
- (dir_buf + filp->f_pos - extra_offset);
- if (ext4_check_dir_entry(inode, filp, de,
- iloc.bh, dir_buf,
- extra_size, filp->f_pos)) {
- ret = stored;
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + ctx->pos - extra_offset);
+ if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+ extra_size, ctx->pos))
+ goto out;
+ if (le32_to_cpu(de->inode)) {
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type)))
goto out;
- }
- if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = filp->f_version;
-
- error = filldir(dirent, de->name,
- de->name_len,
- filp->f_pos,
- le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored++;
- }
- filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
- extra_size);
}
+ ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
}
out:
kfree(dir_buf);
@@ -1842,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
if (error)
goto out;
- physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+ physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
physical += offsetof(struct ext4_inode, i_block);
length = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0723774bdfb5..0188e65e1f58 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
/*
* Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)
filemap_write_and_wait(&inode->i_data);
}
truncate_inode_pages(&inode->i_data, 0);
- ext4_ioend_shutdown(inode);
+
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
}
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
- ext4_ioend_shutdown(inode);
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
goto no_delete;
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
- unsigned int max_pages)
-{
- struct address_space *mapping = inode->i_mapping;
- pgoff_t index;
- struct pagevec pvec;
- pgoff_t num = 0;
- int i, nr_pages, done = 0;
-
- if (max_pages == 0)
- return 0;
- pagevec_init(&pvec, 0);
- while (!done) {
- index = idx;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- (pgoff_t)PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- lock_page(page);
- if (unlikely(page->mapping != mapping) ||
- !PageDirty(page) ||
- PageWriteback(page) ||
- page->index != idx) {
- done = 1;
- unlock_page(page);
- break;
- }
- if (page_has_buffers(page)) {
- bh = head = page_buffers(page);
- do {
- if (!buffer_delay(bh) &&
- !buffer_unwritten(bh))
- done = 1;
- bh = bh->b_this_page;
- } while (!done && (bh != head));
- }
- unlock_page(page);
- if (done)
- break;
- idx++;
- num++;
- if (num >= max_pages) {
- done = 1;
- break;
- }
- }
- pagevec_release(&pvec);
- }
- return num;
-}
-
#ifdef ES_AGGRESSIVE_TEST
static void ext4_map_blocks_es_recheck(handle_t *handle,
struct inode *inode,
@@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+ ext4_es_lru_add(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
@@ -1118,10 +1061,13 @@ static int ext4_write_end(struct file *file,
}
}
- if (ext4_has_inline_data(inode))
- copied = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- else
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ if (ret < 0)
+ goto errout;
+ copied = ret;
+ } else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
@@ -1157,8 +1103,6 @@ static int ext4_write_end(struct file *file,
if (i_size_changed)
ext4_mark_inode_dirty(handle, inode);
- if (copied < 0)
- ret = copied;
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
@@ -1415,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
}
static void ext4_da_page_release_reservation(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
int to_release = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
struct inode *inode = page->mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int stop = offset + length;
int num_clusters;
ext4_fsblk_t lblk;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
+ if (next_off > stop)
+ break;
+
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
@@ -1460,145 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page,
* Delayed allocation stuff
*/
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
- struct ext4_map_blocks *map)
-{
- struct pagevec pvec;
- unsigned long index, end;
- int ret = 0, err, nr_pages, i;
- struct inode *inode = mpd->inode;
- struct address_space *mapping = inode->i_mapping;
- loff_t size = i_size_read(inode);
- unsigned int len, block_start;
- struct buffer_head *bh, *page_bufs = NULL;
- sector_t pblock = 0, cur_logical = 0;
- struct ext4_io_submit io_submit;
+struct mpage_da_data {
+ struct inode *inode;
+ struct writeback_control *wbc;
- BUG_ON(mpd->next_page <= mpd->first_page);
- ext4_io_submit_init(&io_submit, mpd->wbc);
- io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_submit.io_end)
- return -ENOMEM;
+ pgoff_t first_page; /* The first page to write */
+ pgoff_t next_page; /* Current page to examine */
+ pgoff_t last_page; /* Last page to examine */
/*
- * We need to start from the first_page to the next_page - 1
- * to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->b_blocknr we would only be looking
- * at the currently mapped buffer_heads.
+ * Extent to map - this can be after first_page because that can be
+ * fully mapped. We somewhat abuse m_flags to store whether the extent
+ * is delalloc or unwritten.
*/
- index = mpd->first_page;
- end = mpd->next_page - 1;
-
- pagevec_init(&pvec, 0);
- while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- int skip_page = 0;
- struct page *page = pvec.pages[i];
-
- index = page->index;
- if (index > end)
- break;
-
- if (index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
- if (map) {
- cur_logical = index << (PAGE_CACHE_SHIFT -
- inode->i_blkbits);
- pblock = map->m_pblk + (cur_logical -
- map->m_lblk);
- }
- index++;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- bh = page_bufs = page_buffers(page);
- block_start = 0;
- do {
- if (map && (cur_logical >= map->m_lblk) &&
- (cur_logical <= (map->m_lblk +
- (map->m_len - 1)))) {
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock;
- }
- if (buffer_unwritten(bh) ||
- buffer_mapped(bh))
- BUG_ON(bh->b_blocknr != pblock);
- if (map->m_flags & EXT4_MAP_UNINIT)
- set_buffer_uninit(bh);
- clear_buffer_unwritten(bh);
- }
-
- /*
- * skip page if block allocation undone and
- * block is dirty
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh))
- skip_page = 1;
- bh = bh->b_this_page;
- block_start += bh->b_size;
- cur_logical++;
- pblock++;
- } while (bh != page_bufs);
-
- if (skip_page) {
- unlock_page(page);
- continue;
- }
-
- clear_page_dirty_for_io(page);
- err = ext4_bio_write_page(&io_submit, page, len,
- mpd->wbc);
- if (!err)
- mpd->pages_written++;
- /*
- * In error case, we have to continue because
- * remaining pages are still locked
- */
- if (ret == 0)
- ret = err;
- }
- pagevec_release(&pvec);
- }
- ext4_io_submit(&io_submit);
- /* Drop io_end reference we got from init */
- ext4_put_io_end_defer(io_submit.io_end);
- return ret;
-}
+ struct ext4_map_blocks map;
+ struct ext4_io_submit io_submit; /* IO submission data */
+};
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+ bool invalidate)
{
int nr_pages, i;
pgoff_t index, end;
struct pagevec pvec;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t start, last;
+
+ /* This is necessary when next_page == 0. */
+ if (mpd->first_page >= mpd->next_page)
+ return;
index = mpd->first_page;
end = mpd->next_page - 1;
-
- start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ if (invalidate) {
+ ext4_lblk_t start, last;
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+ }
pagevec_init(&pvec, 0);
while (index <= end) {
@@ -1611,14 +1460,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
break;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- block_invalidatepage(page, 0);
- ClearPageUptodate(page);
+ if (invalidate) {
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ ClearPageUptodate(page);
+ }
unlock_page(page);
}
index = pvec.pages[nr_pages - 1]->index + 1;
pagevec_release(&pvec);
}
- return;
}
static void ext4_print_free_blocks(struct inode *inode)
@@ -1647,215 +1497,6 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
-/*
- * mpage_da_map_and_submit - go through given space, map them
- * if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
- int err, blks, get_blocks_flags;
- struct ext4_map_blocks map, *mapp = NULL;
- sector_t next = mpd->b_blocknr;
- unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
- loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
- handle_t *handle = NULL;
-
- /*
- * If the blocks are mapped already, or we couldn't accumulate
- * any blocks, then proceed immediately to the submission stage.
- */
- if ((mpd->b_size == 0) ||
- ((mpd->b_state & (1 << BH_Mapped)) &&
- !(mpd->b_state & (1 << BH_Delay)) &&
- !(mpd->b_state & (1 << BH_Unwritten))))
- goto submit_io;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
-
- /*
- * Call ext4_map_blocks() to allocate any delayed allocation
- * blocks, or to convert an uninitialized extent to be
- * initialized (in the case where we have written into
- * one or more preallocated blocks).
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
- * indicate that we are on the delayed allocation path. This
- * affects functions in many different parts of the allocation
- * call path. This flag exists primarily because we don't
- * want to change *many* call functions, so ext4_map_blocks()
- * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
- * inode's allocation semaphore is taken.
- *
- * If the blocks in questions were delalloc blocks, set
- * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
- * variables are updated after the blocks have been allocated.
- */
- map.m_lblk = next;
- map.m_len = max_blocks;
- /*
- * We're in delalloc path and it is possible that we're going to
- * need more metadata blocks than previously reserved. However
- * we must not fail because we're in writeback and there is
- * nothing we can do about it so it might result in data loss.
- * So use reserved blocks to allocate metadata if possible.
- */
- get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
- EXT4_GET_BLOCKS_METADATA_NOFAIL;
- if (ext4_should_dioread_nolock(mpd->inode))
- get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (mpd->b_state & (1 << BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-
- blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
- if (blks < 0) {
- struct super_block *sb = mpd->inode->i_sb;
-
- err = blks;
- /*
- * If get block returns EAGAIN or ENOSPC and there
- * appears to be free blocks we will just let
- * mpage_da_submit_io() unlock all of the pages.
- */
- if (err == -EAGAIN)
- goto submit_io;
-
- if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
- mpd->retval = err;
- goto submit_io;
- }
-
- /*
- * get block failure will cause us to loop in
- * writepages, because a_ops->writepage won't be able
- * to make progress. The page will be redirtied by
- * writepage and writepages will again try to write
- * the same.
- */
- if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
- ext4_msg(sb, KERN_CRIT,
- "delayed block allocation failed for inode %lu "
- "at logical offset %llu with max blocks %zd "
- "with error %d", mpd->inode->i_ino,
- (unsigned long long) next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- ext4_msg(sb, KERN_CRIT,
- "This should not happen!! Data will be lost");
- if (err == -ENOSPC)
- ext4_print_free_blocks(mpd->inode);
- }
- /* invalidate all the pages */
- ext4_da_block_invalidatepages(mpd);
-
- /* Mark this page range as having been completed */
- mpd->io_done = 1;
- return;
- }
- BUG_ON(blks == 0);
-
- mapp = &map;
- if (map.m_flags & EXT4_MAP_NEW) {
- struct block_device *bdev = mpd->inode->i_sb->s_bdev;
- int i;
-
- for (i = 0; i < map.m_len; i++)
- unmap_underlying_metadata(bdev, map.m_pblk + i);
- }
-
- /*
- * Update on-disk size along with block allocation.
- */
- disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
- if (disksize > i_size_read(mpd->inode))
- disksize = i_size_read(mpd->inode);
- if (disksize > EXT4_I(mpd->inode)->i_disksize) {
- ext4_update_i_disksize(mpd->inode, disksize);
- err = ext4_mark_inode_dirty(handle, mpd->inode);
- if (err)
- ext4_error(mpd->inode->i_sb,
- "Failed to mark inode %lu dirty",
- mpd->inode->i_ino);
- }
-
-submit_io:
- mpage_da_submit_io(mpd, mapp);
- mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
- (1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
- unsigned long b_state)
-{
- sector_t next;
- int blkbits = mpd->inode->i_blkbits;
- int nrblocks = mpd->b_size >> blkbits;
-
- /*
- * XXX Don't go larger than mballoc is willing to allocate
- * This is a stopgap solution. We eventually need to fold
- * mpage_da_submit_io() into this function and then call
- * ext4_map_blocks() multiple times in a loop
- */
- if (nrblocks >= (8*1024*1024 >> blkbits))
- goto flush_it;
-
- /* check if the reserved journal credits might overflow */
- if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
- if (nrblocks >= EXT4_MAX_TRANS_DATA) {
- /*
- * With non-extent format we are limited by the journal
- * credit available. Total credit needed to insert
- * nrblocks contiguous blocks is dependent on the
- * nrblocks. So limit nrblocks.
- */
- goto flush_it;
- }
- }
- /*
- * First block in the extent
- */
- if (mpd->b_size == 0) {
- mpd->b_blocknr = logical;
- mpd->b_size = 1 << blkbits;
- mpd->b_state = b_state & BH_FLAGS;
- return;
- }
-
- next = mpd->b_blocknr + nrblocks;
- /*
- * Can we merge the block to our big extent?
- */
- if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
- mpd->b_size += 1 << blkbits;
- return;
- }
-
-flush_it:
- /*
- * We couldn't merge the block to our extent, so we
- * need to flush current extent and start new one
- */
- mpage_da_map_and_submit(mpd);
- return;
-}
-
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
{
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1888,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
"logical block %lu\n", inode->i_ino, map->m_len,
(unsigned long) map->m_lblk);
+ ext4_es_lru_add(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, &es)) {
@@ -2161,7 +1804,7 @@ out:
* lock so we have to do some magic.
*
* This function can get called via...
- * - ext4_da_writepages after taking page lock (have journal handle)
+ * - ext4_writepages after taking page lock (have journal handle)
* - journal_submit_inode_data_buffers (no journal handle)
* - shrink_page_list via the kswapd/direct reclaim (no journal handle)
* - grab_page_cache when doing write_begin (have journal handle)
@@ -2243,6 +1886,7 @@ static int ext4_writepage(struct page *page,
io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_submit.io_end) {
redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
return -ENOMEM;
}
ret = ext4_bio_write_page(&io_submit, page, len, wbc);
@@ -2252,70 +1896,391 @@ static int ext4_writepage(struct page *page,
return ret;
}
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
/*
- * This is called via ext4_da_writepages() to
- * calculate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks upto full group size.
*/
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @b_state - b_state of the buffer head added
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ unsigned long b_state)
+{
+ struct ext4_map_blocks *map = &mpd->map;
+
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return 0;
+
+ /* First block in the extent? */
+ if (map->m_len == 0) {
+ map->m_lblk = lblk;
+ map->m_len = 1;
+ map->m_flags = b_state & BH_FLAGS;
+ return 1;
+ }
+
+ /* Can we merge the block to our big extent? */
+ if (lblk == map->m_lblk + map->m_len &&
+ (b_state & BH_FLAGS) == map->m_flags) {
+ map->m_len++;
+ return 1;
+ }
+ return 0;
+}
+
+static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct inode *inode = mpd->inode;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+
+ do {
+ BUG_ON(buffer_locked(bh));
+
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
+ lblk >= blocks) {
+ /* Found extent to map? */
+ if (mpd->map.m_len)
+ return false;
+ if (lblk >= blocks)
+ return true;
+ continue;
+ }
+ if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
+ return false;
+ } while (lblk++, (bh = bh->b_this_page) != head);
+ return true;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ * submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+ struct pagevec pvec;
+ int nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *head, *bh;
+ int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+ pgoff_t start, end;
+ ext4_lblk_t lblk;
+ sector_t pblock;
+ int err;
+
+ start = mpd->map.m_lblk >> bpp_bits;
+ end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+ lblk = start << bpp_bits;
+ pblock = mpd->map.m_pblk;
+
+ pagevec_init(&pvec, 0);
+ while (start <= end) {
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+ PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+ /* Upto 'end' pages must be contiguous */
+ BUG_ON(page->index != start);
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ add_page_bufs_to_extent(mpd, head, bh,
+ lblk);
+ pagevec_release(&pvec);
+ return 0;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ } while (++lblk < blocks &&
+ (bh = bh->b_this_page) != head);
+
+ /*
+ * FIXME: This is going to break if dioread_nolock
+ * supports blocksize < pagesize as we will try to
+ * convert potentially unmapped parts of inode.
+ */
+ mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ /* Page fully mapped - let IO run! */
+ err = mpage_submit_page(mpd, page);
+ if (err < 0) {
+ pagevec_release(&pvec);
+ return err;
+ }
+ start++;
+ }
+ pagevec_release(&pvec);
+ }
+ /* Extent fully mapped and matches with page boundary. We are done. */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
{
- int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int get_blocks_flags;
+ int err;
+ trace_ext4_da_write_pages_extent(inode, map);
/*
- * With non-extent format the journal credit needed to
- * insert nrblocks contiguous block is dependent on
- * number of contiguous block. So we will limit
- * number of contiguous block to a sane value
+ * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+ * to convert an uninitialized extent to be initialized (in the case
+ * where we have written into one or more preallocated blocks). It is
+ * possible that we're going to need more metadata blocks than
+ * previously reserved. However we must not fail because we're in
+ * writeback and there is nothing we can do about it so it might result
+ * in data loss. So use reserved blocks to allocate metadata if
+ * possible.
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+ * in question are delalloc blocks. This affects functions in many
+ * different parts of the allocation call path. This flag exists
+ * primarily because we don't want to change *many* call functions, so
+ * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+ * once the inode's allocation semaphore is taken.
*/
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
- (max_blocks > EXT4_MAX_TRANS_DATA))
- max_blocks = EXT4_MAX_TRANS_DATA;
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ if (ext4_should_dioread_nolock(inode))
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ if (map->m_flags & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+ if (err < 0)
+ return err;
+ if (map->m_flags & EXT4_MAP_UNINIT) {
+ if (!mpd->io_submit.io_end->handle &&
+ ext4_handle_valid(handle)) {
+ mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ }
- return ext4_chunk_trans_blocks(inode, max_blocks);
+ BUG_ON(map->m_len == 0);
+ if (map->m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int i;
+
+ for (i = 0; i < map->m_len; i++)
+ unmap_underlying_metadata(bdev, map->m_pblk + i);
+ }
+ return 0;
}
/*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and accumulate pages that need writing, and call
- * mpage_da_map_and_submit to map a single contiguous memory region
- * and then write them.
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ * mpd->len and submit pages underlying it for IO
+ *
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ *
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
*/
-static int write_cache_pages_da(handle_t *handle,
- struct address_space *mapping,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd,
- pgoff_t *done_index)
+static int mpage_map_and_submit_extent(handle_t *handle,
+ struct mpage_da_data *mpd,
+ bool *give_up_on_write)
{
- struct buffer_head *bh, *head;
- struct inode *inode = mapping->host;
- struct pagevec pvec;
- unsigned int nr_pages;
- sector_t logical;
- pgoff_t index, end;
- long nr_to_write = wbc->nr_to_write;
- int i, tag, ret = 0;
-
- memset(mpd, 0, sizeof(struct mpage_da_data));
- mpd->wbc = wbc;
- mpd->inode = inode;
- pagevec_init(&pvec, 0);
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int err;
+ loff_t disksize;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ mpd->io_submit.io_end->offset =
+ ((loff_t)map->m_lblk) << inode->i_blkbits;
+ while (map->m_len) {
+ err = mpage_map_one_extent(handle, mpd);
+ if (err < 0) {
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ goto invalidate_dirty_pages;
+ /*
+ * Let the uper layers retry transient errors.
+ * In the case of ENOSPC, if ext4_count_free_blocks()
+ * is non-zero, a commit should free up blocks.
+ */
+ if ((err == -ENOMEM) ||
+ (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ return err;
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with"
+ " max blocks %u with error %d",
+ inode->i_ino,
+ (unsigned long long)map->m_lblk,
+ (unsigned)map->m_len, -err);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (err == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ invalidate_dirty_pages:
+ *give_up_on_write = true;
+ return err;
+ }
+ /*
+ * Update buffer state, submit mapped pages, and get us new
+ * extent to map
+ */
+ err = mpage_map_and_submit_buffers(mpd);
+ if (err < 0)
+ return err;
+ }
+
+ /* Update on-disk size after IO is submitted */
+ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ int err2;
+
+ ext4_update_i_disksize(inode, disksize);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (err2)
+ ext4_error(inode->i_sb,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
+ if (!err)
+ err = err2;
+ }
+ return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+ int bpp = ext4_journal_blocks_per_page(inode);
+
+ return ext4_meta_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->lblk with length mpd->len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
+ */
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct pagevec pvec;
+ unsigned int nr_pages;
+ pgoff_t index = mpd->first_page;
+ pgoff_t end = mpd->last_page;
+ int tag;
+ int i, err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct buffer_head *head;
+
+ if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
- *done_index = index;
+ pagevec_init(&pvec, 0);
+ mpd->map.m_len = 0;
+ mpd->next_page = index;
while (index <= end) {
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
- return 0;
+ goto out;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -2330,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle,
if (page->index > end)
goto out;
- *done_index = page->index + 1;
-
- /*
- * If we can't merge this page, and we have
- * accumulated an contiguous region, write it
- */
- if ((mpd->next_page != page->index) &&
- (mpd->next_page != mpd->first_page)) {
- mpage_da_map_and_submit(mpd);
- goto ret_extent_tail;
- }
+ /* If we can't merge this page, we are done. */
+ if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+ goto out;
lock_page(page);
-
/*
- * If the page is no longer dirty, or its
- * mapping no longer corresponds to inode we
- * are writing (which means it has been
- * truncated or invalidated), or the page is
- * already under writeback and we are not
- * doing a data integrity writeback, skip the page
+ * If the page is no longer dirty, or its mapping no
+ * longer corresponds to inode we are writing (which
+ * means it has been truncated or invalidated), or the
+ * page is already under writeback and we are not doing
+ * a data integrity writeback, skip the page
*/
if (!PageDirty(page) ||
(PageWriteback(page) &&
- (wbc->sync_mode == WB_SYNC_NONE)) ||
+ (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
unlikely(page->mapping != mapping)) {
unlock_page(page);
continue;
@@ -2363,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle,
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
- /*
- * If we have inline data and arrive here, it means that
- * we will soon create the block for the 1st page, so
- * we'd better clear the inline data here.
- */
- if (ext4_has_inline_data(inode)) {
- BUG_ON(ext4_test_inode_state(inode,
- EXT4_STATE_MAY_INLINE_DATA));
- ext4_destroy_inline_data(handle, inode);
- }
-
- if (mpd->next_page != page->index)
+ if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
- logical = (sector_t) page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
/* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_CACHE_SHIFT - blkbits);
head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate unmapped blocks
- * in the same page. Otherwise we won't make
- * progress with the page in ext4_writepage
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_state);
- if (mpd->io_done)
- goto ret_extent_tail;
- } else if (buffer_dirty(bh) &&
- buffer_mapped(bh)) {
- /*
- * mapped dirty buffer. We need to
- * update the b_state because we look
- * at b_state in mpage_da_map_blocks.
- * We don't update b_size because if we
- * find an unmapped buffer_head later
- * we need to use the b_state flag of
- * that buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state =
- bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
-
- if (nr_to_write > 0) {
- nr_to_write--;
- if (nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE)
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
+ if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+ goto out;
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, page);
+ if (err < 0)
goto out;
}
+
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
+ mpd->next_page - mpd->first_page >=
+ mpd->wbc->nr_to_write)
+ goto out;
}
pagevec_release(&pvec);
cond_resched();
}
return 0;
-ret_extent_tail:
- ret = MPAGE_DA_EXTENT_TAIL;
out:
pagevec_release(&pvec);
- cond_resched();
- return ret;
+ return err;
}
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = ext4_writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
-static int ext4_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- pgoff_t index;
+ pgoff_t writeback_index = 0;
+ long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
+ int cycled = 1;
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
- int pages_written = 0;
- unsigned int max_pages;
- int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0;
- long desired_nr_to_write, nr_to_writebump = 0;
- loff_t range_start = wbc->range_start;
+ int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- pgoff_t done_index = 0;
- pgoff_t end;
+ bool done;
struct blk_plug plug;
+ bool give_up_on_write = false;
- trace_ext4_da_writepages(inode, wbc);
+ trace_ext4_writepages(inode, wbc);
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2472,164 +2391,165 @@ static int ext4_da_writepages(struct address_space *mapping,
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
+ if (ext4_should_journal_data(inode)) {
+ struct blk_plug plug;
+ int ret;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ return ret;
+ }
+
/*
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
* will obscure the real source of the problem. We test
* EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
* the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
+ * read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert upto one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ }
+
+ /*
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
+ */
+ if (ext4_has_inline_data(inode)) {
+ /* Just inode will be modified... */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- range_cyclic = wbc->range_cyclic;
if (wbc->range_cyclic) {
- index = mapping->writeback_index;
- if (index)
+ writeback_index = mapping->writeback_index;
+ if (writeback_index)
cycled = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = LLONG_MAX;
- wbc->range_cyclic = 0;
- end = -1;
+ mpd.first_page = writeback_index;
+ mpd.last_page = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
- }
-
- /*
- * This works around two forms of stupidity. The first is in
- * the writeback code, which caps the maximum number of pages
- * written to be 1024 pages. This is wrong on multiple
- * levels; different architectues have a different page size,
- * which changes the maximum amount of data which gets
- * written. Secondly, 4 megabytes is way too small. XFS
- * forces this value to be 16 megabytes by multiplying
- * nr_to_write parameter by four, and then relies on its
- * allocator to allocate larger extents to make them
- * contiguous. Unfortunately this brings us to the second
- * stupidity, which is that ext4's mballoc code only allocates
- * at most 2048 blocks. So we force contiguous writes up to
- * the number of dirty blocks in the inode, or
- * sbi->max_writeback_mb_bump whichever is smaller.
- */
- max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
- if (!range_cyclic && range_whole) {
- if (wbc->nr_to_write == LONG_MAX)
- desired_nr_to_write = wbc->nr_to_write;
- else
- desired_nr_to_write = wbc->nr_to_write * 8;
- } else
- desired_nr_to_write = ext4_num_dirty_pages(inode, index,
- max_pages);
- if (desired_nr_to_write > max_pages)
- desired_nr_to_write = max_pages;
-
- if (wbc->nr_to_write < desired_nr_to_write) {
- nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
- wbc->nr_to_write = desired_nr_to_write;
+ mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
}
+ mpd.inode = inode;
+ mpd.wbc = wbc;
+ ext4_io_submit_init(&mpd.io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, index, end);
-
+ tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ done = false;
blk_start_plug(&plug);
- while (!ret && wbc->nr_to_write > 0) {
+ while (!done && mpd.first_page <= mpd.last_page) {
+ /* For each extent of pages we use new io_end */
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ break;
+ }
/*
- * we insert one extent at a time. So we need
- * credit needed for single extent allocation.
- * journalled mode is currently not supported
- * by delalloc
+ * We have two constraints: We find one extent to map and we
+ * must always write out whole page (makes a difference when
+ * blocksize < pagesize) so that we don't block on IO when we
+ * try to write out the rest of the page. Journalled mode is
+ * not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
needed_blocks = ext4_da_writepages_trans_blocks(inode);
- /* start a new transaction*/
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- needed_blocks);
+ /* start a new transaction */
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- blk_finish_plug(&plug);
- goto out_writepages;
+ /* Release allocated io_end */
+ ext4_put_io_end(mpd.io_submit.io_end);
+ break;
}
- /*
- * Now call write_cache_pages_da() to find the next
- * contiguous region of logical blocks that need
- * blocks to be allocated by ext4 and submit them.
- */
- ret = write_cache_pages_da(handle, mapping,
- wbc, &mpd, &done_index);
- /*
- * If we have a contiguous extent of pages and we
- * haven't done the I/O yet, map the blocks and submit
- * them for I/O.
- */
- if (!mpd.io_done && mpd.next_page != mpd.first_page) {
- mpage_da_map_and_submit(&mpd);
- ret = MPAGE_DA_EXTENT_TAIL;
+ trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+ ret = mpage_prepare_extent_to_map(&mpd);
+ if (!ret) {
+ if (mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
+ &give_up_on_write);
+ else {
+ /*
+ * We scanned the whole range (or exhausted
+ * nr_to_write), submitted what was mapped and
+ * didn't find anything needing mapping. We are
+ * done.
+ */
+ done = true;
+ }
}
- trace_ext4_da_write_pages(inode, &mpd);
- wbc->nr_to_write -= mpd.pages_written;
-
ext4_journal_stop(handle);
-
- if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
- /* commit the transaction which would
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Drop our io_end reference we got from init */
+ ext4_put_io_end(mpd.io_submit.io_end);
+
+ if (ret == -ENOSPC && sbi->s_journal) {
+ /*
+ * Commit the transaction which would
* free blocks released in the transaction
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal);
ret = 0;
- } else if (ret == MPAGE_DA_EXTENT_TAIL) {
- /*
- * Got one extent now try with rest of the pages.
- * If mpd.retval is set -EIO, journal is aborted.
- * So we don't need to write any more.
- */
- pages_written += mpd.pages_written;
- ret = mpd.retval;
- io_done = 1;
- } else if (wbc->nr_to_write)
- /*
- * There is no more writeout needed
- * or we requested for a noblocking writeout
- * and we found the device congested
- */
+ continue;
+ }
+ /* Fatal error - ENOMEM, EIO... */
+ if (ret)
break;
}
blk_finish_plug(&plug);
- if (!io_done && !cycled) {
+ if (!ret && !cycled) {
cycled = 1;
- index = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = mapping->writeback_index - 1;
+ mpd.last_page = writeback_index - 1;
+ mpd.first_page = 0;
goto retry;
}
/* Update index */
- wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
- * set the writeback_index so that range_cyclic
+ * Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = done_index;
+ mapping->writeback_index = mpd.first_page;
out_writepages:
- wbc->nr_to_write -= nr_to_writebump;
- wbc->range_start = range_start;
- trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
return ret;
}
@@ -2841,7 +2761,8 @@ static int ext4_da_write_end(struct file *file,
return ret ? ret : copied;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* Drop reserved blocks
@@ -2850,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
if (!page_has_buffers(page))
goto out;
- ext4_da_page_release_reservation(page, offset);
+ ext4_da_page_release_reservation(page, offset, length);
out:
- ext4_invalidatepage(page, offset);
+ ext4_invalidatepage(page, offset, length);
return;
}
@@ -2876,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
* laptop_mode, not even desirable). However, to do otherwise
* would require replicating code paths in:
*
- * ext4_da_writepages() ->
+ * ext4_writepages() ->
* write_cache_pages() ---> (via passed in callback function)
* __mpage_da_writepage() -->
* mpage_add_bh_to_extent()
@@ -3001,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- trace_ext4_invalidatepage(page, offset);
+ trace_ext4_invalidatepage(page, offset, length);
/* No journalling happens on data buffers when this function is used */
WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
- block_invalidatepage(page, offset);
+ block_invalidatepage(page, offset, length);
}
static int __ext4_journalled_invalidatepage(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- trace_ext4_journalled_invalidatepage(page, offset);
+ trace_ext4_journalled_invalidatepage(page, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- return jbd2_journal_invalidatepage(journal, page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset, length);
}
/* Wrapper for aops... */
static void ext4_journalled_invalidatepage(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3141,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
BUG_ON(iocb->private == NULL);
+ /*
+ * Make all waiters for direct IO properly wait also for extent
+ * conversion. This also disallows race between truncate() and
+ * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ */
+ if (rw == WRITE)
+ atomic_inc(&inode->i_dio_count);
+
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
if (overwrite) {
- atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
@@ -3216,11 +3147,19 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
ext4_inode_aio_set(inode, NULL);
ext4_put_io_end(io_end);
/*
- * In case of error or no write ext4_end_io_dio() was not
+ * When no IO was submitted ext4_end_io_dio() was not
* called so we have to put iocb's reference.
*/
- if (ret <= 0 && ret != -EIOCBQUEUED) {
+ if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
WARN_ON(iocb->private != io_end);
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->iocb);
+ /*
+ * Generic code already did inode_dio_done() so we
+ * have to clear EXT4_IO_END_DIRECT to not do it for
+ * the second time.
+ */
+ io_end->flag = 0;
ext4_put_io_end(io_end);
iocb->private = NULL;
}
@@ -3232,7 +3171,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* for non AIO case, since the IO is already
* completed, we could do the conversion right here
*/
- err = ext4_convert_unwritten_extents(inode,
+ err = ext4_convert_unwritten_extents(NULL, inode,
offset, ret);
if (err < 0)
ret = err;
@@ -3240,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
}
retake_lock:
+ if (rw == WRITE)
+ inode_dio_done(inode);
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
- inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
@@ -3301,6 +3241,7 @@ static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
.bmap = ext4_bmap,
@@ -3316,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3331,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .writepages = ext4_da_writepages,
+ .writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
.bmap = ext4_bmap,
@@ -3364,89 +3306,56 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
-
/*
- * ext4_discard_partial_page_buffers()
- * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
- * This function finds and locks the page containing the offset
- * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
- * Calling functions that already have the page locked should call
- * ext4_discard_partial_page_buffers_no_lock directly.
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_discard_partial_page_buffers(handle_t *handle,
- struct address_space *mapping, loff_t from,
- loff_t length, int flags)
+int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
struct inode *inode = mapping->host;
- struct page *page;
- int err = 0;
-
- page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (!page)
- return -ENOMEM;
- err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
- from, length, flags);
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
- unlock_page(page);
- page_cache_release(page);
- return err;
+ return ext4_block_zero_page_range(handle, mapping, from, length);
}
/*
- * ext4_discard_partial_page_buffers_no_lock()
- * Zeros a page range of length 'length' starting from offset 'from'.
- * Buffer heads that correspond to the block aligned regions of the
- * zeroed range will be unmapped. Unblock aligned regions
- * will have the corresponding buffer head mapped if needed so that
- * that region of the page can be updated with the partial zero out.
- *
- * This function assumes that the page has already been locked. The
- * The range to be discarded must be contained with in the given page.
- * If the specified range exceeds the end of the page it will be shortened
- * to the end of the page that corresponds to 'from'. This function is
- * appropriate for updating a page and it buffer heads to be unmapped and
- * zeroed for blocks that have been either released, or are going to be
- * released.
- *
- * handle: The journal handle
- * inode: The files inode
- * page: A locked page that contains the offset "from"
- * from: The starting byte offset (from the beginning of the file)
- * to begin discarding
- * len: The length of bytes to discard
- * flags: Optional flags that may be used:
- *
- * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
- * Only zero the regions of the page whose buffer heads
- * have already been unmapped. This flag is appropriate
- * for updating the contents of a page whose blocks may
- * have already been released, and we only want to zero
- * out the regions that correspond to those released blocks.
- *
- * Returns zero on success or negative on failure.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
*/
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags)
+int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned int offset = from & (PAGE_CACHE_SIZE-1);
- unsigned int blocksize, max, pos;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
+ struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
- blocksize = inode->i_sb->s_blocksize;
- max = PAGE_CACHE_SIZE - offset;
+ page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+ mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (!page)
+ return -ENOMEM;
- if (index != page->index)
- return -EINVAL;
+ blocksize = inode->i_sb->s_blocksize;
+ max = blocksize - (offset & (blocksize - 1));
/*
* correct length if it does not fall between
- * 'from' and the end of the page
+ * 'from' and the end of the block
*/
if (length > max || length < 0)
length = max;
@@ -3464,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
iblock++;
pos += blocksize;
}
-
- pos = offset;
- while (pos < offset + length) {
- unsigned int end_of_block, range_to_discard;
-
- err = 0;
-
- /* The length of space left to zero and unmap */
- range_to_discard = offset + length - pos;
-
- /* The length of space until the end of the block */
- end_of_block = blocksize - (pos & (blocksize-1));
-
- /*
- * Do not unmap or zero past end of block
- * for this buffer head
- */
- if (range_to_discard > end_of_block)
- range_to_discard = end_of_block;
-
-
- /*
- * Skip this buffer head if we are only zeroing unampped
- * regions of the page
- */
- if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
- buffer_mapped(bh))
- goto next;
-
- /* If the range is block aligned, unmap */
- if (range_to_discard == blocksize) {
- clear_buffer_dirty(bh);
- bh->b_bdev = NULL;
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- clear_buffer_delay(bh);
- clear_buffer_unwritten(bh);
- clear_buffer_uptodate(bh);
- zero_user(page, pos, range_to_discard);
- BUFFER_TRACE(bh, "Buffer discarded");
- goto next;
- }
-
- /*
- * If this block is not completely contained in the range
- * to be discarded, then it is not going to be released. Because
- * we need to keep this block, we need to make sure this part
- * of the page is uptodate before we modify it by writeing
- * partial zeros on it.
- */
+ if (buffer_freed(bh)) {
+ BUFFER_TRACE(bh, "freed: skip");
+ goto unlock;
+ }
+ if (!buffer_mapped(bh)) {
+ BUFFER_TRACE(bh, "unmapped");
+ ext4_get_block(inode, iblock, bh, 0);
+ /* unmapped? It's a hole - nothing to do */
if (!buffer_mapped(bh)) {
- /*
- * Buffer head must be mapped before we can read
- * from the block
- */
- BUFFER_TRACE(bh, "unmapped");
- ext4_get_block(inode, iblock, bh, 0);
- /* unmapped? It's a hole - nothing to do */
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "still unmapped");
- goto next;
- }
+ BUFFER_TRACE(bh, "still unmapped");
+ goto unlock;
}
+ }
- /* Ok, it's mapped. Make sure it's up-to-date */
- if (PageUptodate(page))
- set_buffer_uptodate(bh);
+ /* Ok, it's mapped. Make sure it's up-to-date */
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt.*/
- if (!buffer_uptodate(bh))
- goto next;
- }
+ if (!buffer_uptodate(bh)) {
+ err = -EIO;
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ /* Uhhuh. Read error. Complain and punt. */
+ if (!buffer_uptodate(bh))
+ goto unlock;
+ }
+ if (ext4_should_journal_data(inode)) {
+ BUFFER_TRACE(bh, "get write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto unlock;
+ }
+ zero_user(page, offset, length);
+ BUFFER_TRACE(bh, "zeroed end of block");
- if (ext4_should_journal_data(inode)) {
- BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (err)
- goto next;
- }
+ if (ext4_should_journal_data(inode)) {
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ } else {
+ err = 0;
+ mark_buffer_dirty(bh);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+ err = ext4_jbd2_file_inode(handle, inode);
+ }
- zero_user(page, pos, range_to_discard);
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
- err = 0;
- if (ext4_should_journal_data(inode)) {
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- } else
- mark_buffer_dirty(bh);
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t length)
+{
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned partial_start, partial_end;
+ ext4_fsblk_t start, end;
+ loff_t byte_end = (lstart + length - 1);
+ int err = 0;
- BUFFER_TRACE(bh, "Partial buffer zeroed");
-next:
- bh = bh->b_this_page;
- iblock++;
- pos += range_to_discard;
- }
+ partial_start = lstart & (sb->s_blocksize - 1);
+ partial_end = byte_end & (sb->s_blocksize - 1);
+ start = lstart >> sb->s_blocksize_bits;
+ end = byte_end >> sb->s_blocksize_bits;
+
+ /* Handle partial zero within the single block */
+ if (start == end &&
+ (partial_start || (partial_end != sb->s_blocksize - 1))) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, length);
+ return err;
+ }
+ /* Handle partial zero out on the start of the range */
+ if (partial_start) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, sb->s_blocksize);
+ if (err)
+ return err;
+ }
+ /* Handle partial zero out on the end of the range */
+ if (partial_end != sb->s_blocksize - 1)
+ err = ext4_block_zero_page_range(handle, mapping,
+ byte_end - partial_end,
+ partial_end + 1);
return err;
}
@@ -3589,14 +3483,12 @@ int ext4_can_truncate(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
{
- struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_page, last_page, page_len;
- loff_t first_page_offset, last_page_offset;
+ loff_t first_block_offset, last_block_offset;
handle_t *handle;
unsigned int credits;
int ret = 0;
@@ -3647,23 +3539,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
offset;
}
- first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+ first_block_offset = round_up(offset, sb->s_blocksize);
+ last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
- first_page_offset = first_page << PAGE_CACHE_SHIFT;
- last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
- /* Now release the pages */
- if (last_page_offset > first_page_offset) {
- truncate_pagecache_range(inode, first_page_offset,
- last_page_offset - 1);
- }
+ /* Now release the pages and zero block aligned part of pages*/
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
- ret = ext4_flush_unwritten_io(inode);
- if (ret)
- goto out_dio;
inode_dio_wait(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3677,66 +3562,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
goto out_dio;
}
- /*
- * Now we need to zero out the non-page-aligned data in the
- * pages at the start and tail of the hole, and unmap the
- * buffer heads for the block aligned regions of the page that
- * were completely zeroed.
- */
- if (first_page > last_page) {
- /*
- * If the file space being truncated is contained
- * within a page just zero out and unmap the middle of
- * that page
- */
- ret = ext4_discard_partial_page_buffers(handle,
- mapping, offset, length, 0);
-
- if (ret)
- goto out_stop;
- } else {
- /*
- * zero out and unmap the partial page that contains
- * the start of the hole
- */
- page_len = first_page_offset - offset;
- if (page_len > 0) {
- ret = ext4_discard_partial_page_buffers(handle, mapping,
- offset, page_len, 0);
- if (ret)
- goto out_stop;
- }
-
- /*
- * zero out and unmap the partial page that contains
- * the end of the hole
- */
- page_len = offset + length - last_page_offset;
- if (page_len > 0) {
- ret = ext4_discard_partial_page_buffers(handle, mapping,
- last_page_offset, page_len, 0);
- if (ret)
- goto out_stop;
- }
- }
-
- /*
- * If i_size is contained in the last page, we need to
- * unmap and zero the partial page after i_size
- */
- if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
- inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- if (page_len > 0) {
- ret = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
-
- if (ret)
- goto out_stop;
- }
- }
+ ret = ext4_zero_partial_blocks(handle, inode, offset,
+ length);
+ if (ret)
+ goto out_stop;
first_block = (offset + sb->s_blocksize - 1) >>
EXT4_BLOCK_SIZE_BITS(sb);
@@ -3812,7 +3641,6 @@ void ext4_truncate(struct inode *inode)
unsigned int credits;
handle_t *handle;
struct address_space *mapping = inode->i_mapping;
- loff_t page_len;
/*
* There is a possibility that we're either freeing the inode
@@ -3839,12 +3667,6 @@ void ext4_truncate(struct inode *inode)
return;
}
- /*
- * finish any pending end_io work so we won't run the risk of
- * converting any truncated blocks to initialized later
- */
- ext4_flush_unwritten_io(inode);
-
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
else
@@ -3856,14 +3678,8 @@ void ext4_truncate(struct inode *inode)
return;
}
- if (inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- if (ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0))
- goto out_stop;
- }
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
/*
* We add the inode to the orphan list, so that if this
@@ -4632,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
inode->i_size >> PAGE_CACHE_SHIFT);
if (!page)
return;
- ret = __ext4_journalled_invalidatepage(page, offset);
+ ret = __ext4_journalled_invalidatepage(page, offset,
+ PAGE_CACHE_SIZE - offset);
unlock_page(page);
page_cache_release(page);
if (ret != -EBUSY)
@@ -4814,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode;
- unsigned long delalloc_blocks;
+ unsigned long long delalloc_blocks;
inode = dentry->d_inode;
generic_fillattr(inode, stat);
@@ -4832,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
EXT4_I(inode)->i_reserved_data_blocks);
- stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
return 0;
}
-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return ext4_ind_trans_blocks(inode, nrblocks, chunk);
- return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+ return ext4_ind_trans_blocks(inode, lblocks);
+ return ext4_ext_index_trans_blocks(inode, pextents);
}
/*
@@ -4854,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -4862,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
int ret = 0;
/*
- * How many index blocks need to touch to modify nrblocks?
- * The "Chunk" flag indicating whether the nrblocks is
- * physically contiguous on disk
- *
- * For Direct IO and fallocate, they calls get_block to allocate
- * one single extent at a time, so they could set the "Chunk" flag
+ * How many index blocks need to touch to map @lblocks logical blocks
+ * to @pextents physical extents?
*/
- idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
ret = idxblocks;
@@ -4877,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks;
- if (chunk)
- groups += 1;
- else
- groups += nrblocks;
-
+ groups = idxblocks + pextents;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -4913,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
int bpp = ext4_journal_blocks_per_page(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, 0);
+ ret = ext4_meta_trans_blocks(inode, bpp, bpp);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b1ed9e07434b..a9ff5e5137ca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2105,7 +2105,12 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
- if (group == ngroups)
+ cond_resched();
+ /*
+ * Artificially restricted ngroups for non-extent
+ * files makes group > ngroups possible on first loop.
+ */
+ if (group >= ngroups)
group = 0;
/* This now checks without needing the buddy page */
@@ -4401,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp) {
- ext4_discard_allocated_blocks(ac);
- goto errout;
- }
+ if (*errp)
+ goto discard_and_exit;
/* as we've just preallocated more space than
- * user requested orinally, we store allocated
+ * user requested originally, we store allocated
* space in a special descriptor */
if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- ext4_mb_new_preallocation(ac);
+ ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ *errp = ext4_mb_new_preallocation(ac);
+ if (*errp) {
+ discard_and_exit:
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ }
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4608,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
BUG_ON(bh && (count > 1));
for (i = 0; i < count; i++) {
+ cond_resched();
if (!bh)
tbh = sb_find_get_block(inode->i_sb,
block + i);
- if (unlikely(!tbh))
+ if (!tbh)
continue;
ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
inode, tbh, block + i);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf364022f..e86dddbd8296 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset;
- long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@ again:
orig_blk_offset = orig_page_offset * blocks_per_page +
data_offset_in_page;
- offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-
/* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6653fc35ecb7..ab2f6dc44b3a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
bh->b_data, bh->b_size,
(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ ((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse(bh);
- return count;
+ /* silently ignore the rest of the block */
+ break;
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 19599bded62a..48786cdb5e6c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -46,29 +46,82 @@ void ext4_exit_pageio(void)
}
/*
- * This function is called by ext4_evict_inode() to make sure there is
- * no more pending I/O completion work left to do.
+ * Print an buffer I/O error compatible with the fs/buffer.c. This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message. We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
*/
-void ext4_ioend_shutdown(struct inode *inode)
+static void buffer_io_error(struct buffer_head *bh)
{
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
+ char b[BDEVNAME_SIZE];
+ printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+ bdevname(bh->b_bdev, b),
+ (unsigned long long)bh->b_blocknr);
+}
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
- /*
- * We need to make sure the work structure is finished being
- * used before we let the inode get destroyed.
- */
- if (work_pending(&EXT4_I(inode)->i_unwritten_work))
- cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
+static void ext4_finish_bio(struct bio *bio)
+{
+ int i;
+ int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ struct bio_vec *bvec = &bio->bi_io_vec[i];
+ struct page *page = bvec->bv_page;
+ struct buffer_head *bh, *head;
+ unsigned bio_start = bvec->bv_offset;
+ unsigned bio_end = bio_start + bvec->bv_len;
+ unsigned under_io = 0;
+ unsigned long flags;
+
+ if (!page)
+ continue;
+
+ if (error) {
+ SetPageError(page);
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+ bh = head = page_buffers(page);
+ /*
+ * We check all buffers in the page under BH_Uptodate_Lock
+ * to avoid races with other end io clearing async_write flags
+ */
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ do {
+ if (bh_offset(bh) < bio_start ||
+ bh_offset(bh) + bh->b_size > bio_end) {
+ if (buffer_async_write(bh))
+ under_io++;
+ continue;
+ }
+ clear_buffer_async_write(bh);
+ if (error)
+ buffer_io_error(bh);
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+ if (!under_io)
+ end_page_writeback(page);
+ }
}
static void ext4_release_io_end(ext4_io_end_t *io_end)
{
+ struct bio *bio, *next_bio;
+
BUG_ON(!list_empty(&io_end->list));
BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->handle);
if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
wake_up_all(ext4_ioend_wq(io_end->inode));
+
+ for (bio = io_end->bio; bio; bio = next_bio) {
+ next_bio = bio->bi_private;
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
if (io_end->flag & EXT4_IO_END_DIRECT)
inode_dio_done(io_end->inode);
if (io_end->iocb)
@@ -86,19 +139,28 @@ static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
wake_up_all(ext4_ioend_wq(inode));
}
-/* check a range of space and convert unwritten extents to written. */
+/*
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_free_ioend()).
+ */
static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
ssize_t size = io->size;
+ handle_t *handle = io->handle;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
- ret = ext4_convert_unwritten_extents(inode, offset, size);
+ io->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
if (ret < 0) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
@@ -111,20 +173,17 @@ static int ext4_end_io(ext4_io_end_t *io)
return ret;
}
-static void dump_completed_IO(struct inode *inode)
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
- ext4_debug("inode %lu completed_io list is empty\n",
- inode->i_ino);
+ if (list_empty(head))
return;
- }
- ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+ ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+ list_for_each_entry(io, head, list) {
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
@@ -145,16 +204,23 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
unsigned long flags;
BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (list_empty(&ei->i_completed_io_list))
- queue_work(wq, &ei->i_unwritten_work);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ if (io_end->handle) {
+ wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+ } else {
+ wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
+ if (list_empty(&ei->i_unrsv_conversion_list))
+ queue_work(wq, &ei->i_unrsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
+ }
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
-static int ext4_do_flush_completed_IO(struct inode *inode)
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ struct list_head *head)
{
ext4_io_end_t *io;
struct list_head unwritten;
@@ -163,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
int err, ret = 0;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- dump_completed_IO(inode);
- list_replace_init(&ei->i_completed_io_list, &unwritten);
+ dump_completed_IO(inode, head);
+ list_replace_init(head, &unwritten);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
@@ -180,23 +246,20 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
}
/*
- * work on completed aio dio IO, to convert unwritten extents to extents
+ * work on completed IO, to convert unwritten extents to extents
*/
-void ext4_end_io_work(struct work_struct *work)
+void ext4_end_io_rsv_work(struct work_struct *work)
{
struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
- i_unwritten_work);
- ext4_do_flush_completed_IO(&ei->vfs_inode);
+ i_rsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}
-int ext4_flush_unwritten_io(struct inode *inode)
+void ext4_end_io_unrsv_work(struct work_struct *work)
{
- int ret;
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
- !(inode->i_state & I_FREEING));
- ret = ext4_do_flush_completed_IO(inode);
- ext4_unwritten_wait(inode);
- return ret;
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_unrsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -228,8 +291,10 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
if (atomic_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_extents(io_end->inode,
- io_end->offset, io_end->size);
+ err = ext4_convert_unwritten_extents(io_end->handle,
+ io_end->inode, io_end->offset,
+ io_end->size);
+ io_end->handle = NULL;
ext4_clear_io_unwritten_flag(io_end);
}
ext4_release_io_end(io_end);
@@ -243,79 +308,31 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
return io_end;
}
-/*
- * Print an buffer I/O error compatible with the fs/buffer.c. This
- * provides compatibility with dmesg scrapers that look for a specific
- * buffer I/O error message. We really need a unified error reporting
- * structure to userspace ala Digital Unix's uerf system, but it's
- * probably not going to happen in my lifetime, due to LKML politics...
- */
-static void buffer_io_error(struct buffer_head *bh)
-{
- char b[BDEVNAME_SIZE];
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
- bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr);
-}
-
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct inode *inode;
- int i;
- int blocksize;
sector_t bi_sector = bio->bi_sector;
BUG_ON(!io_end);
- inode = io_end->inode;
- blocksize = 1 << inode->i_blkbits;
- bio->bi_private = NULL;
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct bio_vec *bvec = &bio->bi_io_vec[i];
- struct page *page = bvec->bv_page;
- struct buffer_head *bh, *head;
- unsigned bio_start = bvec->bv_offset;
- unsigned bio_end = bio_start + bvec->bv_len;
- unsigned under_io = 0;
- unsigned long flags;
- if (!page)
- continue;
-
- if (error) {
- SetPageError(page);
- set_bit(AS_EIO, &page->mapping->flags);
- }
- bh = head = page_buffers(page);
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
/*
- * We check all buffers in the page under BH_Uptodate_Lock
- * to avoid races with other end io clearing async_write flags
+ * Link bio into list hanging from io_end. We have to do it
+ * atomically as bio completions can be racing against each
+ * other.
*/
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
- do {
- if (bh_offset(bh) < bio_start ||
- bh_offset(bh) + blocksize > bio_end) {
- if (buffer_async_write(bh))
- under_io++;
- continue;
- }
- clear_buffer_async_write(bh);
- if (error)
- buffer_io_error(bh);
- } while ((bh = bh->b_this_page) != head);
- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
- local_irq_restore(flags);
- if (!under_io)
- end_page_writeback(page);
+ bio->bi_private = xchg(&io_end->bio, bio);
+ } else {
+ ext4_finish_bio(bio);
+ bio_put(bio);
}
- bio_put(bio);
if (error) {
- io_end->flag |= EXT4_IO_END_ERROR;
+ struct inode *inode = io_end->inode;
+
ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
inode->i_ino,
@@ -324,7 +341,6 @@ static void ext4_end_bio(struct bio *bio, int error)
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
}
-
ext4_put_io_end_defer(io_end);
}
@@ -356,13 +372,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
struct bio *bio;
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+ if (!bio)
+ return -ENOMEM;
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
- if (!io->io_end->size)
- io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
- + bh_offset(bh);
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
return 0;
@@ -372,7 +387,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode,
struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
int ret;
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -387,10 +401,6 @@ submit_and_retry:
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
- io_end = io->io_end;
- if (test_clear_buffer_uninit(bh))
- ext4_set_io_unwritten_flag(inode, io_end);
- io_end->size += bh->b_size;
io->io_next_block++;
return 0;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d01965..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
ext4_fsblk_t end = start + input->blocks_count;
ext4_group_t group = input->group;
ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
- unsigned overhead = ext4_group_overhead_blocks(sb, group);
- ext4_fsblk_t metaend = start + overhead;
+ unsigned overhead;
+ ext4_fsblk_t metaend;
struct buffer_head *bh = NULL;
ext4_grpblk_t free_blocks_count, offset;
int err = -EINVAL;
+ if (group != sbi->s_groups_count) {
+ ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+ input->group, sbi->s_groups_count);
+ return -EINVAL;
+ }
+
+ overhead = ext4_group_overhead_blocks(sb, group);
+ metaend = start + overhead;
input->free_blocks_count = free_blocks_count =
input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
free_blocks_count, input->reserved_blocks);
ext4_get_group_no_and_offset(sb, start, NULL, &offset);
- if (group != sbi->s_groups_count)
- ext4_warning(sb, "Cannot add at group %u (only %u groups)",
- input->group, sbi->s_groups_count);
- else if (offset != 0)
+ if (offset != 0)
ext4_warning(sb, "Last group not full");
else if (input->reserved_blocks > input->blocks_count / 5)
ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
struct inode *inode = NULL;
- int gdb_off, gdb_num;
+ int gdb_off;
int err;
__u16 bg_flags = 0;
- gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@ errout:
err = err2;
if (!err) {
- ext4_fsblk_t first_block;
- first_block = ext4_group_first_block_no(sb, 0);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
(char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84db7c9a..85b3dd60169b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
static void ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
}
if (test_opt(sb, ERRORS_RO)) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
if (test_opt(sb, ERRORS_PANIC))
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
ext4_handle_error(sb);
}
-void ext4_error_inode(struct inode *inode, const char *function,
- unsigned int line, ext4_fsblk_t block,
- const char *fmt, ...)
+void __ext4_error_inode(struct inode *inode, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
struct va_format vaf;
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
ext4_handle_error(inode->i_sb);
}
-void ext4_error_file(struct file *file, const char *function,
- unsigned int line, ext4_fsblk_t block,
- const char *fmt, ...)
+void __ext4_error_file(struct file *file, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
struct va_format vaf;
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
if ((sb->s_flags & MS_RDONLY) == 0) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- sb->s_flags |= MS_RDONLY;
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
panic("EXT4-fs panic from previous error\n");
}
-void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+void __ext4_msg(struct super_block *sb,
+ const char *prefix, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->dio_unwritten_wq);
- destroy_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->unrsv_conversion_wq);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ destroy_workqueue(sbi->unrsv_conversion_wq);
+ destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
- ext4_es_unregister_shrinker(sb);
+ ext4_es_unregister_shrinker(sbi);
del_timer(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
rwlock_init(&ei->i_es_lock);
INIT_LIST_HEAD(&ei->i_es_lru);
ei->i_es_lru_nr = 0;
+ ei->i_touch_when = 0;
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_reserved_quota = 0;
#endif
ei->jinode = NULL;
- INIT_LIST_HEAD(&ei->i_completed_io_list);
+ INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+ INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
- INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);
+ INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
return &ei->vfs_inode;
}
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
.dirty_inode = ext4_dirty_inode,
.drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
+ .sync_fs = ext4_sync_fs_nojournal,
.put_super = ext4_put_super,
.statfs = ext4_statfs,
.remount_fs = ext4_remount,
@@ -1908,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
ext4_group_t flex_group;
- unsigned int groups_per_flex = 0;
int i, err;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
sbi->s_log_groups_per_flex = 0;
return 1;
}
- groups_per_flex = 1U << sbi->s_log_groups_per_flex;
err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
if (err)
@@ -2164,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
dquot_initialize(inode);
if (inode->i_nlink) {
- ext4_msg(sb, KERN_DEBUG,
- "%s: truncating inode %lu to %lld bytes",
- __func__, inode->i_ino, inode->i_size);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
mutex_lock(&inode->i_mutex);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
mutex_unlock(&inode->i_mutex);
nr_truncates++;
} else {
- ext4_msg(sb, KERN_DEBUG,
- "%s: deleting unreferenced inode %lu",
- __func__, inode->i_ino);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
jbd_debug(2, "deleting unreferenced inode %lu\n",
inode->i_ino);
nr_orphans++;
@@ -2377,7 +2396,10 @@ struct ext4_attr {
ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
const char *, size_t);
- int offset;
+ union {
+ int offset;
+ int deprecated_val;
+ } u;
};
static int parse_strtoull(const char *buf,
@@ -2446,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
static ssize_t sbi_ui_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
}
@@ -2455,7 +2477,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
unsigned long t;
int ret;
@@ -2504,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
return count;
}
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
- .offset = offsetof(struct ext4_sb_info, _elname), \
+ .u = { \
+ .offset = offsetof(struct ext4_sb_info, _elname),\
+ }, \
}
#define EXT4_ATTR(name, mode, show, store) \
static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2550,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
#define EXT4_RW_ATTR_SBI_UI(name, elname) \
EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
#define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = sbi_deprecated_show, \
+ .u = { \
+ .deprecated_val = _val, \
+ }, \
+}
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
@@ -2534,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
@@ -3763,7 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.data = (unsigned long) sb;
/* Register extent status tree shrinker */
- ext4_es_register_shrinker(sb);
+ ext4_es_register_shrinker(sbi);
err = percpu_counter_init(&sbi->s_freeclusters_counter,
ext4_count_free_clusters(sb));
@@ -3787,7 +3825,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
- sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
/*
@@ -3915,12 +3952,20 @@ no_journal:
* The maximum number of concurrent works can be high and
* concurrency isn't really necessary. Limit it to 1.
*/
- EXT4_SB(sb)->dio_unwritten_wq =
- alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
- if (!EXT4_SB(sb)->dio_unwritten_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ EXT4_SB(sb)->rsv_conversion_wq =
+ alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->rsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
ret = -ENOMEM;
- goto failed_mount_wq;
+ goto failed_mount4;
+ }
+
+ EXT4_SB(sb)->unrsv_conversion_wq =
+ alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->unrsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+ ret = -ENOMEM;
+ goto failed_mount4;
}
/*
@@ -4074,14 +4119,17 @@ failed_mount4a:
sb->s_root = NULL;
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
- destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+ if (EXT4_SB(sb)->rsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ if (EXT4_SB(sb)->unrsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
failed_mount_wq:
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3:
- ext4_es_unregister_shrinker(sb);
+ ext4_es_unregister_shrinker(sbi);
del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
@@ -4517,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
+ bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
- flush_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ flush_workqueue(sbi->unrsv_conversion_wq);
/*
* Writeback quota in non-journalled quota case - journalled quota has
* no dirty dquots
*/
dquot_writeback_dquots(sb, -1);
+ /*
+ * Data writeback is possible w/o journal transaction, so barrier must
+ * being sent at the end of the function. But we can skip it if
+ * transaction_commit will do it for us.
+ */
+ target = jbd2_get_latest_transaction(sbi->s_journal);
+ if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+ needs_barrier = true;
+
if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(sbi->s_journal, target);
+ ret = jbd2_log_wait_commit(sbi->s_journal, target);
+ }
+ if (needs_barrier) {
+ int err;
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
}
+
+ return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+ int ret = 0;
+
+ trace_ext4_sync_fs(sb, wait);
+ flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
+ dquot_writeback_dquots(sb, -1);
+ if (wait && test_opt(sb, BARRIER))
+ ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
return ret;
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index fd27e7e6326e..e06e0995e00f 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL
Linux website <http://acl.bestbits.at/>.
If you don't know what Access Control Lists are, say N
+
+config F2FS_FS_SECURITY
+ bool "F2FS Security Labels"
+ depends on F2FS_FS_XATTR
+ help
+ Security labels provide an access control facility to support Linux
+ Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+ Linux. This option enables an extended attribute handler for file
+ security labels in the f2fs filesystem, so that it requires enabling
+ the extended attribute support in advance.
+
+ If you are not using a security module, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 44abc2f286e0..b7826ec1b470 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
}
}
- error = f2fs_setxattr(inode, name_index, "", value, size);
+ error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
kfree(value);
if (!error)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b1de01da1a40..66a6b85a51d8 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -357,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
unsigned long blk_size = sbi->blocksize;
struct f2fs_checkpoint *cp_block;
unsigned long long cur_version = 0, pre_version = 0;
- unsigned int crc = 0;
size_t crc_offset;
+ __u32 crc = 0;
/* Read the 1st cp block in this CP pack */
cp_page_1 = get_meta_page(sbi, cp_addr);
@@ -369,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp1;
- crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp1;
@@ -384,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp2;
- crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp2;
@@ -450,13 +450,30 @@ fail_no_cp:
return -EINVAL;
}
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct list_head *head = &sbi->dir_inode_list;
- struct dir_inode_entry *new;
struct list_head *this;
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode == inode)
+ return -EEXIST;
+ }
+ list_add_tail(&new->list, head);
+#ifdef CONFIG_F2FS_STAT_FS
+ sbi->n_dirty_dirs++;
+#endif
+ return 0;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct dir_inode_entry *new;
+
if (!S_ISDIR(inode->i_mode))
return;
retry:
@@ -469,23 +486,31 @@ retry:
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
- if (entry->inode == inode) {
- kmem_cache_free(inode_entry_slab, new);
- goto out;
- }
- }
- list_add_tail(&new->list, head);
- sbi->n_dirty_dirs++;
+ if (__add_dirty_inode(inode, new))
+ kmem_cache_free(inode_entry_slab, new);
- BUG_ON(!S_ISDIR(inode->i_mode));
-out:
inc_page_count(sbi, F2FS_DIRTY_DENTS);
inode_inc_dirty_dents(inode);
SetPagePrivate(page);
+ spin_unlock(&sbi->dir_inode_lock);
+}
+void add_dirty_dir_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct dir_inode_entry *new;
+retry:
+ new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+ if (!new) {
+ cond_resched();
+ goto retry;
+ }
+ new->inode = inode;
+ INIT_LIST_HEAD(&new->list);
+
+ spin_lock(&sbi->dir_inode_lock);
+ if (__add_dirty_inode(inode, new))
+ kmem_cache_free(inode_entry_slab, new);
spin_unlock(&sbi->dir_inode_lock);
}
@@ -499,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode)
return;
spin_lock(&sbi->dir_inode_lock);
- if (atomic_read(&F2FS_I(inode)->dirty_dents))
- goto out;
+ if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+ spin_unlock(&sbi->dir_inode_lock);
+ return;
+ }
list_for_each(this, head) {
struct dir_inode_entry *entry;
@@ -508,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode)
if (entry->inode == inode) {
list_del(&entry->list);
kmem_cache_free(inode_entry_slab, entry);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->n_dirty_dirs--;
+#endif
+ break;
+ }
+ }
+ spin_unlock(&sbi->dir_inode_lock);
+
+ /* Only from the recovery routine */
+ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
+ clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+ iput(inode);
+ }
+}
+
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct list_head *head = &sbi->dir_inode_list;
+ struct list_head *this;
+ struct inode *inode = NULL;
+
+ spin_lock(&sbi->dir_inode_lock);
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode->i_ino == ino) {
+ inode = entry->inode;
break;
}
}
-out:
spin_unlock(&sbi->dir_inode_lock);
+ return inode;
}
void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
@@ -595,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
block_t start_blk;
struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
- unsigned int crc32 = 0;
+ __u32 crc32 = 0;
void *kaddr;
int i;
@@ -664,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
- *(__le32 *)((unsigned char *)ckpt +
- le32_to_cpu(ckpt->checksum_offset))
+ *((__le32 *)((unsigned char *)ckpt +
+ le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
start_blk = __start_cp_addr(sbi);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 91ff93b0b0f4..035f9a345cdf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -68,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
struct buffer_head *bh_result)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
+#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+#endif
pgoff_t start_fofs, end_fofs;
block_t start_blkaddr;
@@ -78,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
return 0;
}
+#ifdef CONFIG_F2FS_STAT_FS
sbi->total_hit_ext++;
+#endif
start_fofs = fi->ext.fofs;
end_fofs = fi->ext.fofs + fi->ext.len - 1;
start_blkaddr = fi->ext.blk_addr;
@@ -96,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
else
bh_result->b_size = UINT_MAX;
+#ifdef CONFIG_F2FS_STAT_FS
sbi->read_hit_ext++;
+#endif
read_unlock(&fi->ext.ext_lock);
return 1;
}
@@ -199,7 +205,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
if (dn.data_blkaddr == NEW_ADDR)
return ERR_PTR(-EINVAL);
- page = grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -233,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
struct page *page;
int err;
+repeat:
+ page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err)
+ if (err) {
+ f2fs_put_page(page, 1);
return ERR_PTR(err);
+ }
f2fs_put_dnode(&dn);
- if (dn.data_blkaddr == NULL_ADDR)
+ if (dn.data_blkaddr == NULL_ADDR) {
+ f2fs_put_page(page, 1);
return ERR_PTR(-ENOENT);
-repeat:
- page = grab_cache_page(mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ }
if (PageUptodate(page))
return page;
@@ -274,9 +285,10 @@ repeat:
*
* Also, caller should grab and release a mutex by calling mutex_lock_op() and
* mutex_unlock_op().
+ * Note that, npage is set only by make_empty_dir.
*/
-struct page *get_new_data_page(struct inode *inode, pgoff_t index,
- bool new_i_size)
+struct page *get_new_data_page(struct inode *inode,
+ struct page *npage, pgoff_t index, bool new_i_size)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
@@ -284,18 +296,20 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
int err;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
+ set_new_dnode(&dn, inode, npage, npage, 0);
err = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (err)
return ERR_PTR(err);
if (dn.data_blkaddr == NULL_ADDR) {
if (reserve_new_block(&dn)) {
- f2fs_put_dnode(&dn);
+ if (!npage)
+ f2fs_put_dnode(&dn);
return ERR_PTR(-ENOSPC);
}
}
- f2fs_put_dnode(&dn);
+ if (!npage)
+ f2fs_put_dnode(&dn);
repeat:
page = grab_cache_page(mapping, index);
if (!page)
@@ -325,6 +339,8 @@ repeat:
if (new_i_size &&
i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+ /* Only the directory inode sets new_i_size */
+ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
mark_inode_dirty_sync(inode);
}
return page;
@@ -481,8 +497,9 @@ int do_write_data_page(struct page *page)
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
- need_inplace_update(inode)) {
+ if (unlikely(old_blk_addr != NEW_ADDR &&
+ !is_cold_data(page) &&
+ need_inplace_update(inode))) {
rewrite_data_page(F2FS_SB(inode->i_sb), page,
old_blk_addr);
} else {
@@ -684,6 +701,27 @@ err:
return err;
}
+static int f2fs_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+
+ SetPageUptodate(page);
+ set_page_dirty(page);
+
+ if (pos + copied > i_size_read(inode)) {
+ i_size_write(inode, pos + copied);
+ mark_inode_dirty(inode);
+ update_inode_page(inode);
+ }
+
+ unlock_page(page);
+ page_cache_release(page);
+ return copied;
+}
+
static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
@@ -698,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
get_data_block_ro);
}
-static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -740,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = {
.writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
- .write_end = nobh_write_end,
+ .write_end = f2fs_write_end,
.set_page_dirty = f2fs_set_data_page_dirty,
.invalidatepage = f2fs_invalidate_data_page,
.releasepage = f2fs_release_data_page,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8d9943786c31..0d6c6aafb235 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -175,12 +175,12 @@ get_cache:
static int stat_show(struct seq_file *s, void *v)
{
- struct f2fs_stat_info *si, *next;
+ struct f2fs_stat_info *si;
int i = 0;
int j;
mutex_lock(&f2fs_stat_mutex);
- list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+ list_for_each_entry(si, &f2fs_stat_list, stat_list) {
char devname[BDEVNAME_SIZE];
update_general_status(si->sbi);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 1ac6b93036b7..9d1cd423450d 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -13,6 +13,7 @@
#include "f2fs.h"
#include "node.h"
#include "acl.h"
+#include "xattr.h"
static unsigned long dir_blocks(struct inode *inode)
{
@@ -215,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
{
- struct page *page = NULL;
- struct f2fs_dir_entry *de = NULL;
- struct f2fs_dentry_block *dentry_blk = NULL;
+ struct page *page;
+ struct f2fs_dir_entry *de;
+ struct f2fs_dentry_block *dentry_blk;
page = get_lock_data_page(dir, 0);
if (IS_ERR(page))
@@ -264,15 +265,10 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
f2fs_put_page(page, 1);
}
-void init_dent_inode(const struct qstr *name, struct page *ipage)
+static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_node *rn;
- if (IS_ERR(ipage))
- return;
-
- wait_on_page_writeback(ipage);
-
/* copy name info. to this inode page */
rn = (struct f2fs_node *)page_address(ipage);
rn->i.i_namelen = cpu_to_le32(name->len);
@@ -280,14 +276,15 @@ void init_dent_inode(const struct qstr *name, struct page *ipage)
set_page_dirty(ipage);
}
-static int make_empty_dir(struct inode *inode, struct inode *parent)
+static int make_empty_dir(struct inode *inode,
+ struct inode *parent, struct page *page)
{
struct page *dentry_page;
struct f2fs_dentry_block *dentry_blk;
struct f2fs_dir_entry *de;
void *kaddr;
- dentry_page = get_new_data_page(inode, 0, true);
+ dentry_page = get_new_data_page(inode, page, 0, true);
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
@@ -317,63 +314,76 @@ static int make_empty_dir(struct inode *inode, struct inode *parent)
return 0;
}
-static int init_inode_metadata(struct inode *inode,
+static struct page *init_inode_metadata(struct inode *inode,
struct inode *dir, const struct qstr *name)
{
+ struct page *page;
+ int err;
+
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
- int err;
- err = new_inode_page(inode, name);
- if (err)
- return err;
+ page = new_inode_page(inode, name);
+ if (IS_ERR(page))
+ return page;
if (S_ISDIR(inode->i_mode)) {
- err = make_empty_dir(inode, dir);
- if (err) {
- remove_inode_page(inode);
- return err;
- }
+ err = make_empty_dir(inode, dir, page);
+ if (err)
+ goto error;
}
err = f2fs_init_acl(inode, dir);
- if (err) {
- remove_inode_page(inode);
- return err;
- }
+ if (err)
+ goto error;
+
+ err = f2fs_init_security(inode, dir, name, page);
+ if (err)
+ goto error;
+
+ wait_on_page_writeback(page);
} else {
- struct page *ipage;
- ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
- set_cold_node(inode, ipage);
- init_dent_inode(name, ipage);
- f2fs_put_page(ipage, 1);
+ page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+ if (IS_ERR(page))
+ return page;
+
+ wait_on_page_writeback(page);
+ set_cold_node(inode, page);
}
+
+ init_dent_inode(name, page);
+
+ /*
+ * This file should be checkpointed during fsync.
+ * We lost i_pino from now on.
+ */
if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+ file_lost_pino(inode);
inc_nlink(inode);
- update_inode_page(inode);
}
- return 0;
+ return page;
+
+error:
+ f2fs_put_page(page, 1);
+ remove_inode_page(inode);
+ return ERR_PTR(err);
}
static void update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
- bool need_dir_update = false;
-
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
if (S_ISDIR(inode->i_mode)) {
inc_nlink(dir);
- need_dir_update = true;
+ set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
if (F2FS_I(dir)->i_current_depth != current_depth) {
F2FS_I(dir)->i_current_depth = current_depth;
- need_dir_update = true;
+ set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
- if (need_dir_update)
+ if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
update_inode_page(dir);
else
mark_inode_dirty(dir);
@@ -423,6 +433,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
struct page *dentry_page = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
int slots = GET_DENTRY_SLOTS(namelen);
+ struct page *page;
int err = 0;
int i;
@@ -448,7 +459,7 @@ start:
bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
for (block = bidx; block <= (bidx + nblock - 1); block++) {
- dentry_page = get_new_data_page(dir, block, true);
+ dentry_page = get_new_data_page(dir, NULL, block, true);
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
@@ -465,12 +476,13 @@ start:
++level;
goto start;
add_dentry:
- err = init_inode_metadata(inode, dir, name);
- if (err)
- goto fail;
-
wait_on_page_writeback(dentry_page);
+ page = init_inode_metadata(inode, dir, name);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto fail;
+ }
de = &dentry_blk->dentry[bit_pos];
de->hash_code = dentry_hash;
de->name_len = cpu_to_le16(namelen);
@@ -481,11 +493,14 @@ add_dentry:
test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
set_page_dirty(dentry_page);
- update_parent_metadata(dir, inode, current_depth);
-
- /* update parent inode number before releasing dentry page */
+ /* we don't need to mark_inode_dirty now */
F2FS_I(inode)->i_pino = dir->i_ino;
+ update_inode(inode, page);
+ f2fs_put_page(page, 1);
+
+ update_parent_metadata(dir, inode, current_depth);
fail:
+ clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
return err;
@@ -591,24 +606,19 @@ bool f2fs_empty_dir(struct inode *dir)
return true;
}
-static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int f2fs_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned long pos = file->f_pos;
struct inode *inode = file_inode(file);
unsigned long npages = dir_blocks(inode);
- unsigned char *types = NULL;
unsigned int bit_pos = 0, start_bit_pos = 0;
- int over = 0;
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dir_entry *de = NULL;
struct page *dentry_page = NULL;
- unsigned int n = 0;
+ unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
unsigned char d_type = DT_UNKNOWN;
int slots;
- types = f2fs_filetype_table;
- bit_pos = (pos % NR_DENTRY_IN_BLOCK);
- n = (pos / NR_DENTRY_IN_BLOCK);
+ bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
for ( ; n < npages; n++) {
dentry_page = get_lock_data_page(inode, n);
@@ -618,31 +628,28 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
start_bit_pos = bit_pos;
dentry_blk = kmap(dentry_page);
while (bit_pos < NR_DENTRY_IN_BLOCK) {
- d_type = DT_UNKNOWN;
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
NR_DENTRY_IN_BLOCK,
bit_pos);
if (bit_pos >= NR_DENTRY_IN_BLOCK)
break;
+ ctx->pos += bit_pos - start_bit_pos;
de = &dentry_blk->dentry[bit_pos];
- if (types && de->file_type < F2FS_FT_MAX)
- d_type = types[de->file_type];
-
- over = filldir(dirent,
- dentry_blk->filename[bit_pos],
- le16_to_cpu(de->name_len),
- (n * NR_DENTRY_IN_BLOCK) + bit_pos,
- le32_to_cpu(de->ino), d_type);
- if (over) {
- file->f_pos += bit_pos - start_bit_pos;
+ if (de->file_type < F2FS_FT_MAX)
+ d_type = f2fs_filetype_table[de->file_type];
+ else
+ d_type = DT_UNKNOWN;
+ if (!dir_emit(ctx,
+ dentry_blk->filename[bit_pos],
+ le16_to_cpu(de->name_len),
+ le32_to_cpu(de->ino), d_type))
goto success;
- }
slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
bit_pos += slots;
}
bit_pos = 0;
- file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
+ ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
dentry_page = NULL;
@@ -659,7 +666,7 @@ success:
const struct file_operations f2fs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = f2fs_readdir,
+ .iterate = f2fs_readdir,
.fsync = f2fs_sync_file,
.unlocked_ioctl = f2fs_ioctl,
};
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 20aab02f2a42..467d42d65c48 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -37,21 +37,35 @@
typecheck(unsigned long long, b) && \
((long long)((a) - (b)) > 0))
-typedef u64 block_t;
+typedef u32 block_t; /*
+ * should not change u32, since it is the on-disk block
+ * address format, __le32.
+ */
typedef u32 nid_t;
struct f2fs_mount_info {
unsigned int opt;
};
-static inline __u32 f2fs_crc32(void *buff, size_t len)
+#define CRCPOLY_LE 0xedb88320
+
+static inline __u32 f2fs_crc32(void *buf, size_t len)
{
- return crc32_le(F2FS_SUPER_MAGIC, buff, len);
+ unsigned char *p = (unsigned char *)buf;
+ __u32 crc = F2FS_SUPER_MAGIC;
+ int i;
+
+ while (len--) {
+ crc ^= *p++;
+ for (i = 0; i < 8; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
+ }
+ return crc;
}
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
{
- return f2fs_crc32(buff, buff_size) == blk_crc;
+ return f2fs_crc32(buf, buf_size) == blk_crc;
}
/*
@@ -148,7 +162,7 @@ struct extent_info {
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
*/
#define FADVISE_COLD_BIT 0x01
-#define FADVISE_CP_BIT 0x02
+#define FADVISE_LOST_PINO_BIT 0x02
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
@@ -369,7 +383,6 @@ struct f2fs_sb_info {
/* for directory inode management */
struct list_head dir_inode_list; /* dir inode list */
spinlock_t dir_inode_lock; /* for dir inode list lock */
- unsigned int n_dirty_dirs; /* # of dir inodes */
/* basic file system units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -406,12 +419,15 @@ struct f2fs_sb_info {
* for stat information.
* one is for the LFS mode, and the other is for the SSR mode.
*/
+#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_stat_info *stat_info; /* FS status information */
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
- unsigned int last_victim[2]; /* last victim segment # */
int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
int bg_gc; /* background gc calls */
+ unsigned int n_dirty_dirs; /* # of dir inodes */
+#endif
+ unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
};
@@ -495,9 +511,17 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
{
- int i = 0;
- for (; i < NR_GLOBAL_LOCKS; i++)
- mutex_lock(&sbi->fs_lock[i]);
+ int i;
+
+ for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
+ /*
+ * This is the only time we take multiple fs_lock[]
+ * instances; the order is immaterial since we
+ * always hold cp_mutex, which serializes multiple
+ * such operations.
+ */
+ mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
+ }
}
static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
@@ -843,9 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
+ FI_DIRTY_INODE, /* indicate inode is dirty or not */
FI_INC_LINK, /* need to increment i_nlink */
FI_ACL_MODE, /* indicate acl mode */
FI_NO_ALLOC, /* should not allocate any blocks */
+ FI_UPDATE_DIR, /* should update inode block for consistency */
+ FI_DELAY_IPUT, /* used for the recovery */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -878,14 +905,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
return 0;
}
+static inline int f2fs_readonly(struct super_block *sb)
+{
+ return sb->s_flags & MS_RDONLY;
+}
+
/*
* file.c
*/
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
void f2fs_truncate(struct inode *);
+int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+int truncate_data_blocks_range(struct dnode_of_data *, int);
long f2fs_ioctl(struct file *, unsigned int, unsigned long);
long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -913,7 +947,6 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
-void init_dent_inode(const struct qstr *, struct page *);
int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
int f2fs_make_empty(struct inode *, struct inode *);
@@ -948,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
int remove_inode_page(struct inode *);
-int new_inode_page(struct inode *, const struct qstr *);
-struct page *new_node_page(struct dnode_of_data *, unsigned int);
+struct page *new_inode_page(struct inode *, const struct qstr *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
void ra_node_page(struct f2fs_sb_info *, nid_t);
struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_node_page_ra(struct page *, int);
@@ -974,7 +1007,6 @@ void destroy_node_manager_caches(void);
*/
void f2fs_balance_fs(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
-void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
void clear_prefree_segments(struct f2fs_sb_info *);
int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
@@ -1011,7 +1043,9 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void set_dirty_dir_page(struct inode *, struct page *);
+void add_dirty_dir_inode(struct inode *);
void remove_dirty_dir_inode(struct inode *);
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
void sync_dirty_dir_inodes(struct f2fs_sb_info *);
void write_checkpoint(struct f2fs_sb_info *, bool);
void init_orphan_info(struct f2fs_sb_info *);
@@ -1025,7 +1059,7 @@ int reserve_new_block(struct dnode_of_data *);
void update_extent_cache(block_t, struct dnode_of_data *);
struct page *find_data_page(struct inode *, pgoff_t, bool);
struct page *get_lock_data_page(struct inode *, pgoff_t);
-struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
int do_write_data_page(struct page *);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1cae864f8dfc..d2d2b7dbdcc1 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -63,9 +63,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
f2fs_put_dnode(&dn);
mutex_unlock_op(sbi, ilock);
+ file_update_time(vma->vm_file);
lock_page(page);
if (page->mapping != inode->i_mapping ||
- page_offset(page) >= i_size_read(inode) ||
+ page_offset(page) > i_size_read(inode) ||
!PageUptodate(page)) {
unlock_page(page);
err = -EFAULT;
@@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
* check to see if the page is mapped already (no holes)
*/
if (PageMappedToDisk(page))
- goto out;
-
- /* fill the page */
- wait_on_page_writeback(page);
+ goto mapped;
/* page is wholly or partially inside EOF */
if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
@@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
set_page_dirty(page);
SetPageUptodate(page);
- file_update_time(vma->vm_file);
+mapped:
+ /* fill the page */
+ wait_on_page_writeback(page);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(err);
@@ -102,6 +102,24 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
.remap_pages = generic_file_remap_pages,
};
+static int get_parent_ino(struct inode *inode, nid_t *pino)
+{
+ struct dentry *dentry;
+
+ inode = igrab(inode);
+ dentry = d_find_any_alias(inode);
+ iput(inode);
+ if (!dentry)
+ return 0;
+
+ inode = igrab(dentry->d_parent->d_inode);
+ dput(dentry);
+
+ *pino = inode->i_ino;
+ iput(inode);
+ return 1;
+}
+
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
@@ -114,7 +132,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
.for_reclaim = 0,
};
- if (inode->i_sb->s_flags & MS_RDONLY)
+ if (f2fs_readonly(inode->i_sb))
return 0;
trace_f2fs_sync_file_enter(inode);
@@ -134,7 +152,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
- else if (is_cp_file(inode))
+ else if (file_wrong_pino(inode))
need_cp = true;
else if (!space_for_roll_forward(sbi))
need_cp = true;
@@ -142,11 +160,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
need_cp = true;
if (need_cp) {
+ nid_t pino;
+
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
+ if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+ get_parent_ino(inode, &pino)) {
+ F2FS_I(inode)->i_pino = pino;
+ file_got_pino(inode);
+ mark_inode_dirty_sync(inode);
+ ret = f2fs_write_inode(inode, NULL);
+ if (ret)
+ goto out;
+ }
} else {
/* if there is no written node page, write its inode page */
while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+ mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
@@ -168,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
int nr_free = 0, ofs = dn->ofs_in_node;
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
@@ -185,10 +215,10 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
update_extent_cache(NULL_ADDR, dn);
invalidate_blocks(sbi, blkaddr);
- dec_valid_block_count(sbi, dn->inode, 1);
nr_free++;
}
if (nr_free) {
+ dec_valid_block_count(sbi, dn->inode, nr_free);
set_page_dirty(dn->node_page);
sync_inode_page(dn);
}
@@ -291,7 +321,7 @@ void f2fs_truncate(struct inode *inode)
}
}
-static int f2fs_getattr(struct vfsmount *mnt,
+int f2fs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
@@ -387,7 +417,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
f2fs_balance_fs(sbi);
ilock = mutex_lock_op(sbi);
- page = get_new_data_page(inode, index, false);
+ page = get_new_data_page(inode, NULL, index, false);
mutex_unlock_op(sbi, ilock);
if (!IS_ERR(page)) {
@@ -575,10 +605,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
int ret;
switch (cmd) {
- case FS_IOC_GETFLAGS:
+ case F2FS_IOC_GETFLAGS:
flags = fi->i_flags & FS_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
- case FS_IOC_SETFLAGS:
+ case F2FS_IOC_SETFLAGS:
{
unsigned int oldflags;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 14961593e93c..35f9b1a196aa 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -76,7 +76,9 @@ static int gc_thread_func(void *data)
else
wait_ms = increase_sleep_time(wait_ms);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->bg_gc++;
+#endif
/* if return value is not zero, no victim was selected */
if (f2fs_gc(sbi))
@@ -89,23 +91,28 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
+ int err = 0;
if (!test_opt(sbi, BG_GC))
- return 0;
+ goto out;
gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
- if (!gc_th)
- return -ENOMEM;
+ if (!gc_th) {
+ err = -ENOMEM;
+ goto out;
+ }
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
+ err = PTR_ERR(gc_th->f2fs_gc_task);
kfree(gc_th);
sbi->gc_thread = NULL;
- return -ENOMEM;
}
- return 0;
+
+out:
+ return err;
}
void stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -234,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
- unsigned int secno;
+ unsigned int secno, max_cost;
int nsearched = 0;
p.alloc_mode = alloc_mode;
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
- p.min_cost = get_max_cost(sbi, &p);
+ p.min_cost = max_cost = get_max_cost(sbi, &p);
mutex_lock(&dirty_i->seglist_lock);
@@ -280,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_cost = cost;
}
- if (cost == get_max_cost(sbi, &p))
+ if (cost == max_cost)
continue;
if (nsearched++ >= MAX_VICTIM_SEARCH) {
@@ -288,8 +295,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
break;
}
}
-got_it:
if (p.min_segno != NULL_SEGNO) {
+got_it:
if (p.alloc_mode == LFS) {
secno = GET_SECNO(sbi, p.min_segno);
if (gc_type == FG_GC)
@@ -314,28 +321,21 @@ static const struct victim_selection default_v_ops = {
static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
{
- struct list_head *this;
struct inode_entry *ie;
- list_for_each(this, ilist) {
- ie = list_entry(this, struct inode_entry, list);
+ list_for_each_entry(ie, ilist, list)
if (ie->inode->i_ino == ino)
return ie->inode;
- }
return NULL;
}
static void add_gc_inode(struct inode *inode, struct list_head *ilist)
{
- struct list_head *this;
- struct inode_entry *new_ie, *ie;
+ struct inode_entry *new_ie;
- list_for_each(this, ilist) {
- ie = list_entry(this, struct inode_entry, list);
- if (ie->inode == inode) {
- iput(inode);
- return;
- }
+ if (inode == find_gc_inode(inode->i_ino, ilist)) {
+ iput(inode);
+ return;
}
repeat:
new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 91ac7f9d88ee..2b2d45d19e3e 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -109,12 +109,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
ret = do_read_inode(inode);
if (ret)
goto bad_inode;
-
- if (!sbi->por_doing && inode->i_nlink == 0) {
- ret = -ENOENT;
- goto bad_inode;
- }
-
make_now:
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
@@ -130,8 +124,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
- __GFP_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -199,6 +192,7 @@ void update_inode(struct inode *inode, struct page *node_page)
set_cold_node(inode, node_page);
set_page_dirty(node_page);
+ clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
}
int update_inode_page(struct inode *inode)
@@ -224,6 +218,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
+ if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+ return 0;
+
if (wbc)
f2fs_balance_fs(sbi);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 47abc9722b17..64c07169df05 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -112,7 +112,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
int count = le32_to_cpu(sbi->raw_super->extension_count);
for (i = 0; i < count; i++) {
if (is_multimedia_file(name, extlist[i])) {
- set_cold_file(inode);
+ file_set_cold(inode);
break;
}
}
@@ -149,8 +149,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
alloc_nid_done(sbi, ino);
- if (!sbi->por_doing)
- d_instantiate(dentry, inode);
+ d_instantiate(dentry, inode);
unlock_new_inode(inode);
return 0;
out:
@@ -173,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_balance_fs(sbi);
inode->i_ctime = CURRENT_TIME;
- atomic_inc(&inode->i_count);
+ ihold(inode);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
ilock = mutex_lock_op(sbi);
@@ -182,17 +181,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
goto out;
- /*
- * This file should be checkpointed during fsync.
- * We lost i_pino from now on.
- */
- set_cp_file(inode);
-
d_instantiate(dentry, inode);
return 0;
out:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
- make_bad_inode(inode);
iput(inode);
return err;
}
@@ -498,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
.rmdir = f2fs_rmdir,
.mknod = f2fs_mknod,
.rename = f2fs_rename,
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
#ifdef CONFIG_F2FS_FS_XATTR
@@ -512,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
@@ -522,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
};
const struct inode_operations f2fs_special_inode_operations = {
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
#ifdef CONFIG_F2FS_FS_XATTR
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3df43b4efd89..b418aee09573 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -408,10 +408,13 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
level = get_node_path(index, offset, noffset);
nids[0] = dn->inode->i_ino;
- npage[0] = get_node_page(sbi, nids[0]);
- if (IS_ERR(npage[0]))
- return PTR_ERR(npage[0]);
+ npage[0] = dn->inode_page;
+ if (!npage[0]) {
+ npage[0] = get_node_page(sbi, nids[0]);
+ if (IS_ERR(npage[0]))
+ return PTR_ERR(npage[0]);
+ }
parent = npage[0];
if (level != 0)
nids[1] = get_nid(parent, offset[0], true);
@@ -430,7 +433,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
}
dn->nid = nids[i];
- npage[i] = new_node_page(dn, noffset[i]);
+ npage[i] = new_node_page(dn, noffset[i], NULL);
if (IS_ERR(npage[i])) {
alloc_nid_failed(sbi, nids[i]);
err = PTR_ERR(npage[i]);
@@ -803,22 +806,19 @@ int remove_inode_page(struct inode *inode)
return 0;
}
-int new_inode_page(struct inode *inode, const struct qstr *name)
+struct page *new_inode_page(struct inode *inode, const struct qstr *name)
{
- struct page *page;
struct dnode_of_data dn;
/* allocate inode page for new inode */
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
- page = new_node_page(&dn, 0);
- init_dent_inode(name, page);
- if (IS_ERR(page))
- return PTR_ERR(page);
- f2fs_put_page(page, 1);
- return 0;
+
+ /* caller should f2fs_put_page(page, 1); */
+ return new_node_page(&dn, 0, NULL);
}
-struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+struct page *new_node_page(struct dnode_of_data *dn,
+ unsigned int ofs, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct address_space *mapping = sbi->node_inode->i_mapping;
@@ -851,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
set_cold_node(dn->inode, page);
dn->node_page = page;
- sync_inode_page(dn);
+ if (ipage)
+ update_inode(dn->inode, ipage);
+ else
+ sync_inode_page(dn);
set_page_dirty(page);
if (ofs == 0)
inc_valid_inode_count(sbi);
@@ -1205,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
return 0;
}
-static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1492,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
new_ni = old_ni;
new_ni.ino = ino;
+ if (!inc_valid_node_count(sbi, NULL, 1))
+ WARN_ON(1);
set_node_addr(sbi, &new_ni, NEW_ADDR);
inc_valid_inode_count(sbi);
-
f2fs_put_page(ipage, 1);
return 0;
}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 0a2d72f0024d..c65fb4f4230f 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -275,25 +275,27 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold node blocks in their node footer
* - Mark cold data pages in page cache
*/
-static inline int is_cold_file(struct inode *inode)
+static inline int is_file(struct inode *inode, int type)
{
- return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
+ return F2FS_I(inode)->i_advise & type;
}
-static inline void set_cold_file(struct inode *inode)
+static inline void set_file(struct inode *inode, int type)
{
- F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+ F2FS_I(inode)->i_advise |= type;
}
-static inline int is_cp_file(struct inode *inode)
+static inline void clear_file(struct inode *inode, int type)
{
- return F2FS_I(inode)->i_advise & FADVISE_CP_BIT;
+ F2FS_I(inode)->i_advise &= ~type;
}
-static inline void set_cp_file(struct inode *inode)
-{
- F2FS_I(inode)->i_advise |= FADVISE_CP_BIT;
-}
+#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
static inline int is_cold_data(struct page *page)
{
@@ -310,29 +312,16 @@ static inline void clear_cold_data(struct page *page)
ClearPageChecked(page);
}
-static inline int is_cold_node(struct page *page)
+static inline int is_node(struct page *page, int type)
{
void *kaddr = page_address(page);
struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << COLD_BIT_SHIFT);
+ return le32_to_cpu(rn->footer.flag) & (1 << type);
}
-static inline unsigned char is_fsync_dnode(struct page *page)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << FSYNC_BIT_SHIFT);
-}
-
-static inline unsigned char is_dent_dnode(struct page *page)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << DENT_BIT_SHIFT);
-}
+#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
+#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
+#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
static inline void set_cold_node(struct inode *inode, struct page *page)
{
@@ -346,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
rn->footer.flag = cpu_to_le32(flag);
}
-static inline void set_fsync_mark(struct page *page, int mark)
+static inline void set_mark(struct page *page, int mark, int type)
{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- if (mark)
- flag |= (0x1 << FSYNC_BIT_SHIFT);
- else
- flag &= ~(0x1 << FSYNC_BIT_SHIFT);
- rn->footer.flag = cpu_to_le32(flag);
-}
-
-static inline void set_dentry_mark(struct page *page, int mark)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (mark)
- flag |= (0x1 << DENT_BIT_SHIFT);
+ flag |= (0x1 << type);
else
- flag &= ~(0x1 << DENT_BIT_SHIFT);
+ flag &= ~(0x1 << type);
rn->footer.flag = cpu_to_le32(flag);
}
+#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
+#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 60c8a5097058..d56d951c2253 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,36 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
static int recover_dentry(struct page *ipage, struct inode *inode)
{
- struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
+ void *kaddr = page_address(ipage);
+ struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
struct f2fs_inode *raw_inode = &(raw_node->i);
- struct qstr name;
+ nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
+ struct qstr name;
struct page *page;
- struct inode *dir;
+ struct inode *dir, *einode;
int err = 0;
- if (!is_dent_dnode(ipage))
- goto out;
-
- dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- goto out;
+ dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
+ if (!dir) {
+ dir = f2fs_iget(inode->i_sb, pino);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto out;
+ }
+ set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+ add_dirty_dir_inode(dir);
}
name.len = le32_to_cpu(raw_inode->i_namelen);
name.name = raw_inode->i_name;
-
+retry:
de = f2fs_find_entry(dir, &name, &page);
- if (de) {
+ if (de && inode->i_ino == le32_to_cpu(de->ino)) {
kunmap(page);
f2fs_put_page(page, 0);
- } else {
- err = __f2fs_add_link(dir, &name, inode);
+ goto out;
+ }
+ if (de) {
+ einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+ if (IS_ERR(einode)) {
+ WARN_ON(1);
+ if (PTR_ERR(einode) == -ENOENT)
+ err = -EEXIST;
+ goto out;
+ }
+ f2fs_delete_entry(de, page, einode);
+ iput(einode);
+ goto retry;
}
- iput(dir);
+ err = __f2fs_add_link(dir, &name, inode);
out:
- kunmap(ipage);
+ f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
+ "ino = %x, name = %s, dir = %lx, err = %d",
+ ino_of_node(ipage), raw_inode->i_name,
+ IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
@@ -79,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page)
struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
struct f2fs_inode *raw_inode = &(raw_node->i);
+ if (!IS_INODE(node_page))
+ return 0;
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_size_write(inode, le64_to_cpu(raw_inode->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -88,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page)
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- return recover_dentry(node_page, inode);
+ if (is_dent_dnode(node_page))
+ return recover_dentry(node_page, inode);
+
+ f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
+ ino_of_node(node_page), raw_inode->i_name);
+ return 0;
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -119,14 +145,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
lock_page(page);
if (cp_ver != cpver_of_node(page))
- goto unlock_out;
+ break;
if (!is_fsync_dnode(page))
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
if (entry) {
- entry->blkaddr = blkaddr;
if (IS_INODE(page) && is_dent_dnode(page))
set_inode_flag(F2FS_I(entry->inode),
FI_INC_LINK);
@@ -134,48 +159,40 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
- goto unlock_out;
+ break;
}
/* add this fsync inode to the list */
entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
if (!entry) {
err = -ENOMEM;
- goto unlock_out;
+ break;
}
entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
if (IS_ERR(entry->inode)) {
err = PTR_ERR(entry->inode);
kmem_cache_free(fsync_entry_slab, entry);
- goto unlock_out;
+ break;
}
-
list_add_tail(&entry->list, head);
- entry->blkaddr = blkaddr;
- }
- if (IS_INODE(page)) {
- err = recover_inode(entry->inode, page);
- if (err == -ENOENT) {
- goto next;
- } else if (err) {
- err = -EINVAL;
- goto unlock_out;
- }
}
+ entry->blkaddr = blkaddr;
+
+ err = recover_inode(entry->inode, page);
+ if (err && err != -ENOENT)
+ break;
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
}
-unlock_out:
unlock_page(page);
out:
__free_pages(page, 0);
return err;
}
-static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
- struct list_head *head)
+static void destroy_fsync_dnodes(struct list_head *head)
{
struct fsync_inode_entry *entry, *tmp;
@@ -186,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
}
}
-static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
- block_t blkaddr)
+static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+ block_t blkaddr, struct dnode_of_data *dn)
{
struct seg_entry *sentry;
unsigned int segno = GET_SEGNO(sbi, blkaddr);
unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
(sbi->blocks_per_seg - 1);
struct f2fs_summary sum;
- nid_t ino;
+ nid_t ino, nid;
void *kaddr;
struct inode *inode;
struct page *node_page;
@@ -203,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
sentry = get_seg_entry(sbi, segno);
if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
- return;
+ return 0;
/* Get the previous summary */
for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -222,20 +239,39 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
f2fs_put_page(sum_page, 1);
}
+ /* Use the locked dnode page and inode */
+ nid = le32_to_cpu(sum.nid);
+ if (dn->inode->i_ino == nid) {
+ struct dnode_of_data tdn = *dn;
+ tdn.nid = nid;
+ tdn.node_page = dn->inode_page;
+ tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ truncate_data_blocks_range(&tdn, 1);
+ return 0;
+ } else if (dn->nid == nid) {
+ struct dnode_of_data tdn = *dn;
+ tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ truncate_data_blocks_range(&tdn, 1);
+ return 0;
+ }
+
/* Get the node page */
- node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
+ node_page = get_node_page(sbi, nid);
+ if (IS_ERR(node_page))
+ return PTR_ERR(node_page);
bidx = start_bidx_of_node(ofs_of_node(node_page)) +
- le16_to_cpu(sum.ofs_in_node);
+ le16_to_cpu(sum.ofs_in_node);
ino = ino_of_node(node_page);
f2fs_put_page(node_page, 1);
/* Deallocate previous index in the node page */
inode = f2fs_iget(sbi->sb, ino);
if (IS_ERR(inode))
- return;
+ return PTR_ERR(inode);
truncate_hole(inode, bidx, bidx + 1);
iput(inode);
+ return 0;
}
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
@@ -245,7 +281,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct dnode_of_data dn;
struct f2fs_summary sum;
struct node_info ni;
- int err = 0;
+ int err = 0, recovered = 0;
int ilock;
start = start_bidx_of_node(ofs_of_node(page));
@@ -283,13 +319,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
/* Check the previous node page having this index */
- check_index_in_prev_nodes(sbi, dest);
+ err = check_index_in_prev_nodes(sbi, dest, &dn);
+ if (err)
+ goto err;
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* write dummy data page */
recover_data_page(sbi, NULL, &sum, src, dest);
update_extent_cache(dest, &dn);
+ recovered++;
}
dn.ofs_in_node++;
}
@@ -305,9 +344,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
set_page_dirty(dn.node_page);
recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
+err:
f2fs_put_dnode(&dn);
mutex_unlock_op(sbi, ilock);
- return 0;
+
+ f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
+ "recovered_data = %d blocks, err = %d",
+ inode->i_ino, recovered, err);
+ return err;
}
static int recover_data(struct f2fs_sb_info *sbi,
@@ -340,7 +384,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
lock_page(page);
if (cp_ver != cpver_of_node(page))
- goto unlock_out;
+ break;
entry = get_fsync_inode(head, ino_of_node(page));
if (!entry)
@@ -348,7 +392,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
err = do_recover_data(sbi, entry->inode, page, blkaddr);
if (err)
- goto out;
+ break;
if (entry->blkaddr == blkaddr) {
iput(entry->inode);
@@ -359,7 +403,6 @@ next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
}
-unlock_out:
unlock_page(page);
out:
__free_pages(page, 0);
@@ -382,6 +425,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&inode_list);
/* step #1: find fsynced inode numbers */
+ sbi->por_doing = 1;
err = find_fsync_dnodes(sbi, &inode_list);
if (err)
goto out;
@@ -390,13 +434,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
goto out;
/* step #2: recover data */
- sbi->por_doing = 1;
err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
- sbi->por_doing = 0;
BUG_ON(!list_empty(&inode_list));
out:
- destroy_fsync_dnodes(sbi, &inode_list);
+ destroy_fsync_dnodes(&inode_list);
kmem_cache_destroy(fsync_entry_slab);
- write_checkpoint(sbi, false);
+ sbi->por_doing = 0;
+ if (!err)
+ write_checkpoint(sbi, false);
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d8e84e49a5c3..a86d125a9885 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -94,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
* Adding dirty entry into seglist is not critical operation.
* If a given segment is one of current working segments, it won't be added.
*/
-void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned short valid_blocks;
@@ -126,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno, offset = 0;
+ unsigned int segno = -1;
unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
while (1) {
segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
- offset);
+ segno + 1);
if (segno >= total_segs)
break;
__set_test_and_free(sbi, segno);
- offset = segno + 1;
}
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -144,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
void clear_prefree_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno, offset = 0;
+ unsigned int segno = -1;
unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
while (1) {
segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
- offset);
+ segno + 1);
if (segno >= total_segs)
break;
- offset = segno + 1;
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
dirty_i->nr_dirty[PRE]--;
@@ -257,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
* This function should be resided under the curseg_mutex lock
*/
static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
- struct f2fs_summary *sum, unsigned short offset)
+ struct f2fs_summary *sum)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
void *addr = curseg->sum_blk;
- addr += offset * sizeof(struct f2fs_summary);
+ addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
memcpy(addr, sum, sizeof(struct f2fs_summary));
return;
}
@@ -311,64 +309,14 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
f2fs_put_page(page, 1);
}
-static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type)
-{
- struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
- unsigned int segno;
- unsigned int ofs = 0;
-
- /*
- * If there is not enough reserved sections,
- * we should not reuse prefree segments.
- */
- if (has_not_enough_free_secs(sbi, 0))
- return NULL_SEGNO;
-
- /*
- * NODE page should not reuse prefree segment,
- * since those information is used for SPOR.
- */
- if (IS_NODESEG(type))
- return NULL_SEGNO;
-next:
- segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
- ofs += sbi->segs_per_sec;
-
- if (segno < TOTAL_SEGS(sbi)) {
- int i;
-
- /* skip intermediate segments in a section */
- if (segno % sbi->segs_per_sec)
- goto next;
-
- /* skip if the section is currently used */
- if (sec_usage_check(sbi, GET_SECNO(sbi, segno)))
- goto next;
-
- /* skip if whole section is not prefree */
- for (i = 1; i < sbi->segs_per_sec; i++)
- if (!test_bit(segno + i, prefree_segmap))
- goto next;
-
- /* skip if whole section was not free at the last checkpoint */
- for (i = 0; i < sbi->segs_per_sec; i++)
- if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
- goto next;
-
- return segno;
- }
- return NULL_SEGNO;
-}
-
static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- unsigned int segno = curseg->segno;
+ unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
- if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec)
- return !test_bit(segno + 1, free_i->free_segmap);
+ if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
+ return !test_bit(segno, free_i->free_segmap);
return 0;
}
@@ -495,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
int dir = ALLOC_LEFT;
write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ GET_SUM_BLOCK(sbi, segno));
if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
dir = ALLOC_RIGHT;
@@ -599,11 +547,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
goto out;
}
- curseg->next_segno = check_prefree_segments(sbi, type);
-
- if (curseg->next_segno != NULL_SEGNO)
- change_curseg(sbi, type, false);
- else if (type == CURSEG_WARM_NODE)
+ if (type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
new_curseg(sbi, type, false);
@@ -612,7 +556,10 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
else
new_curseg(sbi, type, false);
out:
+#ifdef CONFIG_F2FS_STAT_FS
sbi->segment_count[curseg->alloc_type]++;
+#endif
+ return;
}
void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -795,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
if (S_ISDIR(inode->i_mode))
return CURSEG_HOT_DATA;
- else if (is_cold_data(page) || is_cold_file(inode))
+ else if (is_cold_data(page) || file_is_cold(inode))
return CURSEG_COLD_DATA;
else
return CURSEG_WARM_DATA;
@@ -844,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
* because, this function updates a summary entry in the
* current summary block.
*/
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
mutex_lock(&sit_i->sentry_lock);
__refresh_next_blkoff(sbi, curseg);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->block_count[curseg->alloc_type]++;
+#endif
/*
* SIT information should be updated before segment allocation,
@@ -943,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
(sbi->blocks_per_seg - 1);
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
@@ -980,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
}
curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
(sbi->blocks_per_seg - 1);
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
/* change the current log to the next block addr in advance */
if (next_segno != segno) {
@@ -1579,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int segno = 0, offset = 0;
+ unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
unsigned short valid_blocks;
- while (segno < TOTAL_SEGS(sbi)) {
+ while (1) {
/* find dirty segment based on free segmap */
- segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
- if (segno >= TOTAL_SEGS(sbi))
+ segno = find_next_inuse(free_i, total_segs, offset);
+ if (segno >= total_segs)
break;
offset = segno + 1;
valid_blocks = get_valid_blocks(sbi, segno, 0);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8555f7df82c7..75c7dc363e92 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -34,7 +34,7 @@
static struct kmem_cache *f2fs_inode_cachep;
enum {
- Opt_gc_background_off,
+ Opt_gc_background,
Opt_disable_roll_forward,
Opt_discard,
Opt_noheap,
@@ -46,7 +46,7 @@ enum {
};
static match_table_t f2fs_tokens = {
- {Opt_gc_background_off, "background_gc_off"},
+ {Opt_gc_background, "background_gc=%s"},
{Opt_disable_roll_forward, "disable_roll_forward"},
{Opt_discard, "discard"},
{Opt_noheap, "no_heap"},
@@ -76,6 +76,91 @@ static void init_once(void *foo)
inode_init_once(&fi->vfs_inode);
}
+static int parse_options(struct super_block *sb, char *options)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ substring_t args[MAX_OPT_ARGS];
+ char *p, *name;
+ int arg = 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, f2fs_tokens, args);
+
+ switch (token) {
+ case Opt_gc_background:
+ name = match_strdup(&args[0]);
+
+ if (!name)
+ return -ENOMEM;
+ if (!strncmp(name, "on", 2))
+ set_opt(sbi, BG_GC);
+ else if (!strncmp(name, "off", 3))
+ clear_opt(sbi, BG_GC);
+ else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_disable_roll_forward:
+ set_opt(sbi, DISABLE_ROLL_FORWARD);
+ break;
+ case Opt_discard:
+ set_opt(sbi, DISCARD);
+ break;
+ case Opt_noheap:
+ set_opt(sbi, NOHEAP);
+ break;
+#ifdef CONFIG_F2FS_FS_XATTR
+ case Opt_nouser_xattr:
+ clear_opt(sbi, XATTR_USER);
+ break;
+#else
+ case Opt_nouser_xattr:
+ f2fs_msg(sb, KERN_INFO,
+ "nouser_xattr options not supported");
+ break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ case Opt_noacl:
+ clear_opt(sbi, POSIX_ACL);
+ break;
+#else
+ case Opt_noacl:
+ f2fs_msg(sb, KERN_INFO, "noacl options not supported");
+ break;
+#endif
+ case Opt_active_logs:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
+ return -EINVAL;
+ sbi->active_logs = arg;
+ break;
+ case Opt_disable_ext_identify:
+ set_opt(sbi, DISABLE_EXT_IDENTIFY);
+ break;
+ default:
+ f2fs_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" or missing value",
+ p);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
@@ -112,6 +197,17 @@ static int f2fs_drop_inode(struct inode *inode)
return generic_drop_inode(inode);
}
+/*
+ * f2fs_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We should call set_dirty_inode to write the dirty inode through write_inode.
+ */
+static void f2fs_dirty_inode(struct inode *inode, int flags)
+{
+ set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+ return;
+}
+
static void f2fs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -170,7 +266,7 @@ static int f2fs_freeze(struct super_block *sb)
{
int err;
- if (sb->s_flags & MS_RDONLY)
+ if (f2fs_readonly(sb))
return 0;
err = f2fs_sync_fs(sb, 1);
@@ -214,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (test_opt(sbi, BG_GC))
- seq_puts(seq, ",background_gc_on");
+ if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
+ seq_printf(seq, ",background_gc=%s", "on");
else
- seq_puts(seq, ",background_gc_off");
+ seq_printf(seq, ",background_gc=%s", "off");
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, DISCARD))
@@ -244,11 +340,64 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static int f2fs_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_mount_info org_mount_opt;
+ int err, active_logs;
+
+ /*
+ * Save the old mount options in case we
+ * need to restore them.
+ */
+ org_mount_opt = sbi->mount_opt;
+ active_logs = sbi->active_logs;
+
+ /* parse mount options */
+ err = parse_options(sb, data);
+ if (err)
+ goto restore_opts;
+
+ /*
+ * Previous and new state of filesystem is RO,
+ * so no point in checking GC conditions.
+ */
+ if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+ goto skip;
+
+ /*
+ * We stop the GC thread if FS is mounted as RO
+ * or if background_gc = off is passed in mount
+ * option. Also sync the filesystem.
+ */
+ if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
+ if (sbi->gc_thread) {
+ stop_gc_thread(sbi);
+ f2fs_sync_fs(sb, 1);
+ }
+ } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
+ err = start_gc_thread(sbi);
+ if (err)
+ goto restore_opts;
+ }
+skip:
+ /* Update the POSIXACL Flag */
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+ return 0;
+
+restore_opts:
+ sbi->mount_opt = org_mount_opt;
+ sbi->active_logs = active_logs;
+ return err;
+}
+
static struct super_operations f2fs_sops = {
.alloc_inode = f2fs_alloc_inode,
.drop_inode = f2fs_drop_inode,
.destroy_inode = f2fs_destroy_inode,
.write_inode = f2fs_write_inode,
+ .dirty_inode = f2fs_dirty_inode,
.show_options = f2fs_show_options,
.evict_inode = f2fs_evict_inode,
.put_super = f2fs_put_super,
@@ -256,6 +405,7 @@ static struct super_operations f2fs_sops = {
.freeze_fs = f2fs_freeze,
.unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
+ .remount_fs = f2fs_remount,
};
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -303,79 +453,6 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
- char *options)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int arg = 0;
-
- if (!options)
- return 0;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, f2fs_tokens, args);
-
- switch (token) {
- case Opt_gc_background_off:
- clear_opt(sbi, BG_GC);
- break;
- case Opt_disable_roll_forward:
- set_opt(sbi, DISABLE_ROLL_FORWARD);
- break;
- case Opt_discard:
- set_opt(sbi, DISCARD);
- break;
- case Opt_noheap:
- set_opt(sbi, NOHEAP);
- break;
-#ifdef CONFIG_F2FS_FS_XATTR
- case Opt_nouser_xattr:
- clear_opt(sbi, XATTR_USER);
- break;
-#else
- case Opt_nouser_xattr:
- f2fs_msg(sb, KERN_INFO,
- "nouser_xattr options not supported");
- break;
-#endif
-#ifdef CONFIG_F2FS_FS_POSIX_ACL
- case Opt_noacl:
- clear_opt(sbi, POSIX_ACL);
- break;
-#else
- case Opt_noacl:
- f2fs_msg(sb, KERN_INFO, "noacl options not supported");
- break;
-#endif
- case Opt_active_logs:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
- return -EINVAL;
- sbi->active_logs = arg;
- break;
- case Opt_disable_ext_identify:
- set_opt(sbi, DISABLE_EXT_IDENTIFY);
- break;
- default:
- f2fs_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
- }
- return 0;
-}
-
static loff_t max_file_size(unsigned bits)
{
loff_t result = ADDRS_PER_INODE;
@@ -541,6 +618,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto free_sb_buf;
}
+ sb->s_fs_info = sbi;
/* init some FS parameters */
sbi->active_logs = NR_CURSEG_TYPE;
@@ -553,7 +631,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sbi, POSIX_ACL);
#endif
/* parse mount options */
- err = parse_options(sb, sbi, (char *)data);
+ err = parse_options(sb, (char *)data);
if (err)
goto free_sb_buf;
@@ -565,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
- sb->s_fs_info = sbi;
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -674,10 +751,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
"Cannot recover all fsync data errno=%ld", err);
}
- /* After POR, we can run background GC thread */
- err = start_gc_thread(sbi);
- if (err)
- goto fail;
+ /*
+ * If filesystem is not mounted as read-only then
+ * do start the gc_thread.
+ */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ /* After POR, we can run background GC thread.*/
+ err = start_gc_thread(sbi);
+ if (err)
+ goto fail;
+ }
err = f2fs_build_stats(sbi);
if (err)
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 0b02dce31356..3ab07ecd86ca 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -20,6 +20,7 @@
*/
#include <linux/rwsem.h>
#include <linux/f2fs_fs.h>
+#include <linux/security.h>
#include "f2fs.h"
#include "xattr.h"
@@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
prefix = XATTR_TRUSTED_PREFIX;
prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ prefix = XATTR_SECURITY_PREFIX;
+ prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ break;
default:
return -EINVAL;
}
@@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
total_len = prefix_len + name_len + 1;
if (list && total_len <= list_size) {
memcpy(list, prefix, prefix_len);
- memcpy(list+prefix_len, name, name_len);
+ memcpy(list + prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
return total_len;
@@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ break;
default:
return -EINVAL;
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_getxattr(dentry->d_inode, type, name,
- buffer, size);
+ return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
}
static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ break;
default:
return -EINVAL;
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_setxattr(dentry->d_inode, type, name, value, size);
+ return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
}
static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
@@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
return 0;
}
+#ifdef CONFIG_F2FS_FS_SECURITY
+static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *page)
+{
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, (struct page *)page);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
+int f2fs_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr, struct page *ipage)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &f2fs_initxattrs, ipage);
+}
+#endif
+
const struct xattr_handler f2fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = F2FS_XATTR_INDEX_USER,
@@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
.set = f2fs_xattr_advise_set,
};
+const struct xattr_handler f2fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .flags = F2FS_XATTR_INDEX_SECURITY,
+ .list = f2fs_xattr_generic_list,
+ .get = f2fs_xattr_generic_get,
+ .set = f2fs_xattr_generic_set,
+};
+
static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
#endif
[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+ [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
+#endif
[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
};
@@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = {
&f2fs_xattr_acl_default_handler,
#endif
&f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+ &f2fs_xattr_security_handler,
+#endif
&f2fs_xattr_advise_handler,
NULL,
};
@@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
return -ENODATA;
page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
base_addr = page_address(page);
list_for_each_xattr(entry, base_addr) {
@@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return 0;
page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
base_addr = page_address(page);
list_for_each_xattr(entry, base_addr) {
@@ -296,7 +347,7 @@ cleanup:
}
int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len)
+ const void *value, size_t value_len, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -335,7 +386,7 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
mark_inode_dirty(inode);
- page = new_node_page(&dn, XATTR_NODE_OFFSET);
+ page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
if (IS_ERR(page)) {
alloc_nid_failed(sbi, fi->i_xattr_nid);
fi->i_xattr_nid = 0;
@@ -435,7 +486,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
inode->i_ctime = CURRENT_TIME;
clear_inode_flag(fi, FI_ACL_MODE);
}
- update_inode_page(inode);
+ if (ipage)
+ update_inode(inode, ipage);
+ else
+ update_inode_page(inode);
mutex_unlock_op(sbi, ilock);
return 0;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 49c9558305e3..3c0817bef25d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
extern const struct xattr_handler f2fs_xattr_acl_access_handler;
extern const struct xattr_handler f2fs_xattr_acl_default_handler;
extern const struct xattr_handler f2fs_xattr_advise_handler;
+extern const struct xattr_handler f2fs_xattr_security_handler;
extern const struct xattr_handler *f2fs_xattr_handlers[];
-extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len);
-extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t buffer_size);
-extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
- size_t buffer_size);
-
+extern int f2fs_setxattr(struct inode *, int, const char *,
+ const void *, size_t, struct page *);
+extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
+extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
#else
#define f2fs_xattr_handlers NULL
static inline int f2fs_setxattr(struct inode *inode, int name_index,
- const char *name, const void *value, size_t value_len)
+ const char *name, const void *value, size_t value_len)
{
return -EOPNOTSUPP;
}
@@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
}
#endif
+#ifdef CONFIG_F2FS_FS_SECURITY
+extern int f2fs_init_security(struct inode *, struct inode *,
+ const struct qstr *, struct page *);
+#else
+static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr, struct page *ipage)
+{
+ return 0;
+}
+#endif
#endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 7a6f02caf286..3963ede84eb0 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -543,6 +543,7 @@ end_of_dir:
EXPORT_SYMBOL_GPL(fat_search_long);
struct fat_ioctl_filldir_callback {
+ struct dir_context ctx;
void __user *dirent;
int result;
/* for dir ioctl */
@@ -552,8 +553,9 @@ struct fat_ioctl_filldir_callback {
int short_len;
};
-static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
- filldir_t filldir, int short_only, int both)
+static int __fat_readdir(struct inode *inode, struct file *file,
+ struct dir_context *ctx, int short_only,
+ struct fat_ioctl_filldir_callback *both)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -564,27 +566,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
unsigned char bufname[FAT_MAX_SHORT_SIZE];
int isvfat = sbi->options.isvfat;
const char *fill_name = NULL;
- unsigned long inum;
- unsigned long lpos, dummy, *furrfu = &lpos;
+ int fake_offset = 0;
loff_t cpos;
int short_len = 0, fill_len = 0;
int ret = 0;
mutex_lock(&sbi->s_lock);
- cpos = filp->f_pos;
+ cpos = ctx->pos;
/* Fake . and .. for the root directory. */
if (inode->i_ino == MSDOS_ROOT_INO) {
- while (cpos < 2) {
- if (filldir(dirent, "..", cpos+1, cpos,
- MSDOS_ROOT_INO, DT_DIR) < 0)
- goto out;
- cpos++;
- filp->f_pos++;
- }
- if (cpos == 2) {
- dummy = 2;
- furrfu = &dummy;
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ if (ctx->pos == 2) {
+ fake_offset = 1;
cpos = 0;
}
}
@@ -619,7 +614,7 @@ parse_record:
int status = fat_parse_long(inode, &cpos, &bh, &de,
&unicode, &nr_slots);
if (status < 0) {
- filp->f_pos = cpos;
+ ctx->pos = cpos;
ret = status;
goto out;
} else if (status == PARSE_INVALID)
@@ -639,6 +634,19 @@ parse_record:
/* !both && !short_only, so we don't need shortname. */
if (!both)
goto start_filldir;
+
+ short_len = fat_parse_short(sb, de, bufname,
+ sbi->options.dotsOK);
+ if (short_len == 0)
+ goto record_end;
+ /* hack for fat_ioctl_filldir() */
+ both->longname = fill_name;
+ both->long_len = fill_len;
+ both->shortname = bufname;
+ both->short_len = short_len;
+ fill_name = NULL;
+ fill_len = 0;
+ goto start_filldir;
}
}
@@ -646,28 +654,21 @@ parse_record:
if (short_len == 0)
goto record_end;
- if (nr_slots) {
- /* hack for fat_ioctl_filldir() */
- struct fat_ioctl_filldir_callback *p = dirent;
-
- p->longname = fill_name;
- p->long_len = fill_len;
- p->shortname = bufname;
- p->short_len = short_len;
- fill_name = NULL;
- fill_len = 0;
- } else {
- fill_name = bufname;
- fill_len = short_len;
- }
+ fill_name = bufname;
+ fill_len = short_len;
start_filldir:
- lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
- if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME))
- inum = inode->i_ino;
- else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
- inum = parent_ino(filp->f_path.dentry);
+ if (!fake_offset)
+ ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+
+ if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
+ if (!dir_emit_dot(file, ctx))
+ goto fill_failed;
+ } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
+ if (!dir_emit_dotdot(file, ctx))
+ goto fill_failed;
} else {
+ unsigned long inum;
loff_t i_pos = fat_make_i_pos(sb, bh, de);
struct inode *tmp = fat_iget(sb, i_pos);
if (tmp) {
@@ -675,18 +676,17 @@ start_filldir:
iput(tmp);
} else
inum = iunique(sb, MSDOS_ROOT_INO);
+ if (!dir_emit(ctx, fill_name, fill_len, inum,
+ (de->attr & ATTR_DIR) ? DT_DIR : DT_REG))
+ goto fill_failed;
}
- if (filldir(dirent, fill_name, fill_len, *furrfu, inum,
- (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0)
- goto fill_failed;
-
record_end:
- furrfu = &lpos;
- filp->f_pos = cpos;
+ fake_offset = 0;
+ ctx->pos = cpos;
goto get_new;
end_of_dir:
- filp->f_pos = cpos;
+ ctx->pos = cpos;
fill_failed:
brelse(bh);
if (unicode)
@@ -696,10 +696,9 @@ out:
return ret;
}
-static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int fat_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
- return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
+ return __fat_readdir(file_inode(file), file, ctx, 0, NULL);
}
#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
@@ -755,20 +754,25 @@ efault: \
FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
-static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
+static int fat_ioctl_readdir(struct inode *inode, struct file *file,
void __user *dirent, filldir_t filldir,
int short_only, int both)
{
- struct fat_ioctl_filldir_callback buf;
+ struct fat_ioctl_filldir_callback buf = {
+ .ctx.actor = filldir,
+ .dirent = dirent
+ };
int ret;
buf.dirent = dirent;
buf.result = 0;
mutex_lock(&inode->i_mutex);
+ buf.ctx.pos = file->f_pos;
ret = -ENOENT;
if (!IS_DEADDIR(inode)) {
- ret = __fat_readdir(inode, filp, &buf, filldir,
- short_only, both);
+ ret = __fat_readdir(inode, file, &buf.ctx,
+ short_only, both ? &buf : NULL);
+ file->f_pos = buf.ctx.pos;
}
mutex_unlock(&inode->i_mutex);
if (ret >= 0)
@@ -854,7 +858,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
const struct file_operations fat_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = fat_readdir,
+ .iterate = fat_readdir,
.unlocked_ioctl = fat_dir_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = fat_compat_dir_ioctl,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index dfce656ddb33..5d4513cb1b3c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1229,6 +1229,19 @@ static int fat_read_root(struct inode *inode)
return 0;
}
+static unsigned long calc_fat_clusters(struct super_block *sb)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+ /* Divide first to avoid overflow */
+ if (sbi->fat_bits != 12) {
+ unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits;
+ return ent_per_sec * sbi->fat_length;
+ }
+
+ return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
+}
+
/*
* Read the super block of an MS-DOS FS.
*/
@@ -1434,7 +1447,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sbi->dirty = b->fat16.state & FAT_STATE_DIRTY;
/* check that FAT table does not overflow */
- fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
+ fat_clusters = calc_fat_clusters(sb);
total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
if (total_clusters > MAX_FAT(sb)) {
if (!silent)
diff --git a/fs/file_table.c b/fs/file_table.c
index cd4d87a82951..485dc0eddd67 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -306,17 +306,18 @@ void fput(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
+ unsigned long flags;
+
file_sb_list_del(file);
- if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) {
- unsigned long flags;
- spin_lock_irqsave(&delayed_fput_lock, flags);
- list_add(&file->f_u.fu_list, &delayed_fput_list);
- schedule_work(&delayed_fput_work);
- spin_unlock_irqrestore(&delayed_fput_lock, flags);
- return;
+ if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+ init_task_work(&file->f_u.fu_rcuhead, ____fput);
+ if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
+ return;
}
- init_task_work(&file->f_u.fu_rcuhead, ____fput);
- task_work_add(task, &file->f_u.fu_rcuhead, true);
+ spin_lock_irqsave(&delayed_fput_lock, flags);
+ list_add(&file->f_u.fu_list, &delayed_fput_list);
+ schedule_work(&delayed_fput_work);
+ spin_unlock_irqrestore(&delayed_fput_lock, flags);
}
}
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 664b07a53870..25d4099a4aea 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -49,7 +49,7 @@
static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int);
-static int vxfs_readdir(struct file *, void *, filldir_t);
+static int vxfs_readdir(struct file *, struct dir_context *);
const struct inode_operations vxfs_dir_inode_ops = {
.lookup = vxfs_lookup,
@@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = {
const struct file_operations vxfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = vxfs_readdir,
+ .iterate = vxfs_readdir,
};
@@ -235,7 +235,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
* Zero.
*/
static int
-vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
+vxfs_readdir(struct file *fp, struct dir_context *ctx)
{
struct inode *ip = file_inode(fp);
struct super_block *sbp = ip->i_sb;
@@ -243,20 +243,17 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
u_long page, npages, block, pblocks, nblocks, offset;
loff_t pos;
- switch ((long)fp->f_pos) {
- case 0:
- if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
- goto out;
- fp->f_pos++;
- /* fallthrough */
- case 1:
- if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0)
- goto out;
- fp->f_pos++;
- /* fallthrough */
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(fp, ctx))
+ return 0;
+ ctx->pos = 1;
}
-
- pos = fp->f_pos - 2;
+ if (ctx->pos == 1) {
+ if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
+ return 0;
+ ctx->pos = 2;
+ }
+ pos = ctx->pos - 2;
if (pos > VXFS_DIRROUND(ip->i_size))
return 0;
@@ -270,16 +267,16 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
for (; page < npages; page++, block = 0) {
- caddr_t kaddr;
+ char *kaddr;
struct page *pp;
pp = vxfs_get_page(ip->i_mapping, page);
if (IS_ERR(pp))
continue;
- kaddr = (caddr_t)page_address(pp);
+ kaddr = (char *)page_address(pp);
for (; block <= nblocks && block <= pblocks; block++) {
- caddr_t baddr, limit;
+ char *baddr, *limit;
struct vxfs_dirblk *dbp;
struct vxfs_direct *de;
@@ -292,21 +289,18 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
(kaddr + offset) :
(baddr + VXFS_DIRBLKOV(dbp)));
- for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) {
- int over;
-
+ for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
if (!de->d_reclen)
break;
if (!de->d_ino)
continue;
- offset = (caddr_t)de - kaddr;
- over = filler(retp, de->d_name, de->d_namelen,
- ((page << PAGE_CACHE_SHIFT) | offset) + 2,
- de->d_ino, DT_UNKNOWN);
- if (over) {
+ offset = (char *)de - kaddr;
+ ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+ if (!dir_emit(ctx, de->d_name, de->d_namelen,
+ de->d_ino, DT_UNKNOWN)) {
vxfs_put_page(pp);
- goto done;
+ return 0;
}
}
offset = 0;
@@ -314,9 +308,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
vxfs_put_page(pp);
offset = 0;
}
-
-done:
- fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
-out:
+ ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
return 0;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3be57189efd5..a85ac4e33436 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,6 +45,7 @@ struct wb_writeback_work {
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
+ unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
@@ -443,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/*
* Make sure to wait on the data before writing out the metadata.
* This is important for filesystems that modify metadata on data
- * I/O completion.
+ * I/O completion. We don't do it for sync(2) writeback because it has a
+ * separate, external IO completion path and ->sync_fs for guaranteeing
+ * inode metadata is written back correctly.
*/
- if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
@@ -578,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb,
.tagged_writepages = work->tagged_writepages,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
+ .for_sync = work->for_sync,
.range_cyclic = work->range_cyclic,
.range_start = 0,
.range_end = LLONG_MAX,
@@ -1362,6 +1366,7 @@ void sync_inodes_sb(struct super_block *sb)
.range_cyclic = 0,
.done = &done,
.reason = WB_REASON_SYNC,
+ .for_sync = 1,
};
/* Nothing to do? */
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index b52aed1dca97..f7cff367db7f 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object(
struct fscache_object, cookie_link);
cache = object->cache;
- if (object->state >= FSCACHE_OBJECT_DYING ||
+ if (fscache_object_is_dying(object) ||
test_bit(FSCACHE_IOERROR, &cache->flags))
cache = NULL;
@@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache,
BUG_ON(!ifsdef);
cache->flags = 0;
- ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- ifsdef->state = FSCACHE_OBJECT_ACTIVE;
+ ifsdef->event_mask =
+ ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
+ ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+ __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
if (!tagname)
tagname = cache->identifier;
@@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache,
{
struct fscache_object *object;
- spin_lock(&cache->object_list_lock);
-
while (!list_empty(&cache->object_list)) {
- object = list_entry(cache->object_list.next,
- struct fscache_object, cache_link);
- list_move_tail(&object->cache_link, dying_objects);
+ spin_lock(&cache->object_list_lock);
- _debug("withdraw %p", object->cookie);
+ if (!list_empty(&cache->object_list)) {
+ object = list_entry(cache->object_list.next,
+ struct fscache_object, cache_link);
+ list_move_tail(&object->cache_link, dying_objects);
- spin_lock(&object->lock);
- spin_unlock(&cache->object_list_lock);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
- spin_unlock(&object->lock);
+ _debug("withdraw %p", object->cookie);
+
+ /* This must be done under object_list_lock to prevent
+ * a race with fscache_drop_object().
+ */
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+ }
+ spin_unlock(&cache->object_list_lock);
cond_resched();
- spin_lock(&cache->object_list_lock);
}
-
- spin_unlock(&cache->object_list_lock);
}
/**
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index e2cba1f60c21..0e91a3c9fdb2 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie(
atomic_set(&cookie->usage, 1);
atomic_set(&cookie->n_children, 0);
+ /* We keep the active count elevated until relinquishment to prevent an
+ * attempt to wake up every time the object operations queue quiesces.
+ */
+ atomic_set(&cookie->n_active, 1);
+
atomic_inc(&parent->usage);
atomic_inc(&parent->n_children);
@@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
cookie->flags =
(1 << FSCACHE_COOKIE_LOOKING_UP) |
- (1 << FSCACHE_COOKIE_CREATING) |
(1 << FSCACHE_COOKIE_NO_DATA_YET);
/* ask the cache to allocate objects for this cookie and its parent
@@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
/* initiate the process of looking up all the objects in the chain
* (done by fscache_initialise_object()) */
- fscache_enqueue_object(object);
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
spin_unlock(&cookie->lock);
@@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
object_already_extant:
ret = -ENOBUFS;
- if (object->state >= FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_dead(object)) {
spin_unlock(&cookie->lock);
goto error;
}
@@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
ret = -EEXIST;
hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
if (p->cache == object->cache) {
- if (p->state >= FSCACHE_OBJECT_DYING)
+ if (fscache_object_is_dying(p))
ret = -ENOBUFS;
goto cant_attach_object;
}
@@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
hlist_for_each_entry(p, &cookie->parent->backing_objects,
cookie_link) {
if (p->cache == object->cache) {
- if (p->state >= FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_dying(p)) {
ret = -ENOBUFS;
spin_unlock(&cookie->parent->lock);
goto cant_attach_object;
@@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object,
cookie_link);
- if (object->state < FSCACHE_OBJECT_DYING)
+ if (fscache_object_is_live(object))
fscache_raise_event(
object, FSCACHE_OBJECT_EV_INVALIDATE);
}
@@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie);
*/
void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
{
- struct fscache_cache *cache;
struct fscache_object *object;
- unsigned long event;
fscache_stat(&fscache_n_relinquishes);
if (retire)
@@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
return;
}
- _enter("%p{%s,%p},%d",
- cookie, cookie->def->name, cookie->netfs_data, retire);
+ _enter("%p{%s,%p,%d},%d",
+ cookie, cookie->def->name, cookie->netfs_data,
+ atomic_read(&cookie->n_active), retire);
+
+ ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
if (atomic_read(&cookie->n_children) != 0) {
printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
@@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
BUG();
}
- /* wait for the cookie to finish being instantiated (or to fail) */
- if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
- fscache_stat(&fscache_n_relinquishes_waitcrt);
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- }
-
- event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+ /* No further netfs-accessing operations on this cookie permitted */
+ set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+ if (retire)
+ set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
-try_again:
spin_lock(&cookie->lock);
-
- /* break links with all the active objects */
- while (!hlist_empty(&cookie->backing_objects)) {
- int n_reads;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object,
- cookie_link);
-
- _debug("RELEASE OBJ%x", object->debug_id);
-
- set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
- n_reads = atomic_read(&object->n_reads);
- if (n_reads) {
- int n_ops = object->n_ops;
- int n_in_progress = object->n_in_progress;
- spin_unlock(&cookie->lock);
- printk(KERN_ERR "FS-Cache:"
- " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
- cookie->def->name,
- n_reads, n_ops, n_in_progress);
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- printk("Wait finished\n");
- goto try_again;
- }
-
- /* detach each cache object from the object cookie */
- spin_lock(&object->lock);
- hlist_del_init(&object->cookie_link);
-
- cache = object->cache;
- object->cookie = NULL;
- fscache_raise_event(object, event);
- spin_unlock(&object->lock);
-
- if (atomic_dec_and_test(&cookie->usage))
- /* the cookie refcount shouldn't be reduced to 0 yet */
- BUG();
+ hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
}
+ spin_unlock(&cookie->lock);
- /* detach pointers back to the netfs */
+ /* Wait for cessation of activity requiring access to the netfs (when
+ * n_active reaches 0).
+ */
+ if (!atomic_dec_and_test(&cookie->n_active))
+ wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+
+ /* Clear pointers back to the netfs */
cookie->netfs_data = NULL;
cookie->def = NULL;
-
- spin_unlock(&cookie->lock);
+ BUG_ON(cookie->stores.rnode);
if (cookie->parent) {
ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -553,7 +524,7 @@ try_again:
atomic_dec(&cookie->parent->n_children);
}
- /* finally dispose of the cookie */
+ /* Dispose of the netfs's link to the cookie */
ASSERTCMP(atomic_read(&cookie->usage), >, 0);
fscache_cookie_put(cookie);
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index f5b4baee7352..10a2ade0bdf8 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = {
struct fscache_cookie fscache_fsdef_index = {
.usage = ATOMIC_INIT(1),
+ .n_active = ATOMIC_INIT(1),
.lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
.backing_objects = HLIST_HEAD_INIT,
.def = &fscache_fsdef_index_def,
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ee38fef4be51..12d505bedb5c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void)
extern int fscache_wait_bit(void *);
extern int fscache_wait_bit_interruptible(void *);
+extern int fscache_wait_atomic_t(atomic_t *);
/*
* object.c
*/
-extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
-
-extern void fscache_withdrawing_object(struct fscache_cache *,
- struct fscache_object *);
extern void fscache_enqueue_object(struct fscache_object *);
/*
@@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *);
extern const struct file_operations fscache_objlist_fops;
extern void fscache_objlist_add(struct fscache_object *);
+extern void fscache_objlist_remove(struct fscache_object *);
#else
#define fscache_objlist_add(object) do {} while(0)
+#define fscache_objlist_remove(object) do {} while(0)
#endif
/*
@@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object,
unsigned event)
{
BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
+#if 0
+ printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
+ object->debug_id, object->event_mask, (1 << event));
+#endif
if (!test_and_set_bit(event, &object->events) &&
test_bit(event, &object->event_mask))
fscache_enqueue_object(object);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index f9d856773f79..7c27907e650c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags)
schedule();
return 0;
}
-EXPORT_SYMBOL(fscache_wait_bit);
/*
* wait_on_bit() sleep function for interruptible waiting
@@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags)
schedule();
return signal_pending(current);
}
-EXPORT_SYMBOL(fscache_wait_bit_interruptible);
+
+/*
+ * wait_on_atomic_t() sleep function for uninterruptible waiting
+ */
+int fscache_wait_atomic_t(atomic_t *p)
+{
+ schedule();
+ return 0;
+}
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index e028b8eb1c40..b1bb6117473a 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
/* initialise the primary index cookie */
atomic_set(&netfs->primary_index->usage, 1);
atomic_set(&netfs->primary_index->n_children, 0);
+ atomic_set(&netfs->primary_index->n_active, 1);
netfs->primary_index->def = &fscache_fsdef_netfs_def;
netfs->primary_index->parent = &fscache_fsdef_index;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index f27c89d17885..e1959efad64f 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj)
write_unlock(&fscache_object_list_lock);
}
-/**
- * fscache_object_destroy - Note that a cache object is about to be destroyed
- * @object: The object to be destroyed
- *
- * Note the imminent destruction and deallocation of a cache object record.
+/*
+ * Remove an object from the object list.
*/
-void fscache_object_destroy(struct fscache_object *obj)
+void fscache_objlist_remove(struct fscache_object *obj)
{
write_lock(&fscache_object_list_lock);
@@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj)
write_unlock(&fscache_object_list_lock);
}
-EXPORT_SYMBOL(fscache_object_destroy);
/*
* find the object in the tree on or after the specified index
@@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
{
struct fscache_objlist_data *data = m->private;
struct fscache_object *obj = v;
+ struct fscache_cookie *cookie;
unsigned long config = data->config;
- uint16_t keylen, auxlen;
char _type[3], *type;
- bool no_cookie;
u8 *buf = data->buf, *p;
if ((unsigned long) v == 1) {
seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
- " EM EV F S"
+ " EM EV FL S"
" | NETFS_COOKIE_DEF TY FL NETFS_DATA");
if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
FSCACHE_OBJLIST_CONFIG_AUX))
@@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
if ((unsigned long) v == 2) {
seq_puts(m, "======== ======== ==== ===== === === === == ====="
- " == == = ="
+ " == == == ="
" | ================ == == ================");
if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
FSCACHE_OBJLIST_CONFIG_AUX))
@@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
} \
} while(0)
+ cookie = obj->cookie;
if (~config) {
- FILTER(obj->cookie,
+ FILTER(cookie->def,
COOKIE, NOCOOKIE);
- FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+ FILTER(fscache_object_is_active(obj) ||
obj->n_ops != 0 ||
obj->n_obj_ops != 0 ||
obj->flags ||
@@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
}
seq_printf(m,
- "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
+ "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
obj->debug_id,
obj->parent ? obj->parent->debug_id : -1,
- fscache_object_states_short[obj->state],
+ obj->state->short_name,
obj->n_children,
obj->n_ops,
obj->n_obj_ops,
@@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
obj->flags,
work_busy(&obj->work));
- no_cookie = true;
- keylen = auxlen = 0;
- if (obj->cookie) {
- spin_lock(&obj->lock);
- if (obj->cookie) {
- switch (obj->cookie->def->type) {
- case 0:
- type = "IX";
- break;
- case 1:
- type = "DT";
- break;
- default:
- sprintf(_type, "%02u",
- obj->cookie->def->type);
- type = _type;
- break;
- }
+ if (fscache_use_cookie(obj)) {
+ uint16_t keylen = 0, auxlen = 0;
- seq_printf(m, "%-16s %s %2lx %16p",
- obj->cookie->def->name,
- type,
- obj->cookie->flags,
- obj->cookie->netfs_data);
-
- if (obj->cookie->def->get_key &&
- config & FSCACHE_OBJLIST_CONFIG_KEY)
- keylen = obj->cookie->def->get_key(
- obj->cookie->netfs_data,
- buf, 400);
-
- if (obj->cookie->def->get_aux &&
- config & FSCACHE_OBJLIST_CONFIG_AUX)
- auxlen = obj->cookie->def->get_aux(
- obj->cookie->netfs_data,
- buf + keylen, 512 - keylen);
-
- no_cookie = false;
+ switch (cookie->def->type) {
+ case 0:
+ type = "IX";
+ break;
+ case 1:
+ type = "DT";
+ break;
+ default:
+ sprintf(_type, "%02u", cookie->def->type);
+ type = _type;
+ break;
}
- spin_unlock(&obj->lock);
- if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+ seq_printf(m, "%-16s %s %2lx %16p",
+ cookie->def->name,
+ type,
+ cookie->flags,
+ cookie->netfs_data);
+
+ if (cookie->def->get_key &&
+ config & FSCACHE_OBJLIST_CONFIG_KEY)
+ keylen = cookie->def->get_key(cookie->netfs_data,
+ buf, 400);
+
+ if (cookie->def->get_aux &&
+ config & FSCACHE_OBJLIST_CONFIG_AUX)
+ auxlen = cookie->def->get_aux(cookie->netfs_data,
+ buf + keylen, 512 - keylen);
+ fscache_unuse_cookie(obj);
+
+ if (keylen > 0 || auxlen > 0) {
seq_printf(m, " ");
for (p = buf; keylen > 0; keylen--)
seq_printf(m, "%02x", *p++);
@@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
seq_printf(m, "%02x", *p++);
}
}
- }
- if (no_cookie)
- seq_printf(m, "<no_cookie>\n");
- else
seq_printf(m, "\n");
+ } else {
+ seq_printf(m, "<no_netfs>\n");
+ }
return 0;
}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 50d41c180211..86d75a60b20c 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -15,52 +15,131 @@
#define FSCACHE_DEBUG_LEVEL COOKIE
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/prefetch.h>
#include "internal.h"
-const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
- [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
- [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
- [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
- [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
- [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
- [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING",
- [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
- [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
- [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
- [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
- [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
- [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
- [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
- [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
+static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+
+#define __STATE_NAME(n) fscache_osm_##n
+#define STATE(n) (&__STATE_NAME(n))
+
+/*
+ * Define a work state. Work states are execution states. No event processing
+ * is performed by them. The function attached to a work state returns a
+ * pointer indicating the next state to which the state machine should
+ * transition. Returning NO_TRANSIT repeats the current state, but goes back
+ * to the scheduler first.
+ */
+#define WORK_STATE(n, sn, f) \
+ const struct fscache_state __STATE_NAME(n) = { \
+ .name = #n, \
+ .short_name = sn, \
+ .work = f \
+ }
+
+/*
+ * Returns from work states.
+ */
+#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
+
+#define NO_TRANSIT ((struct fscache_state *)NULL)
+
+/*
+ * Define a wait state. Wait states are event processing states. No execution
+ * is performed by them. Wait states are just tables of "if event X occurs,
+ * clear it and transition to state Y". The dispatcher returns to the
+ * scheduler if none of the events in which the wait state has an interest are
+ * currently pending.
+ */
+#define WAIT_STATE(n, sn, ...) \
+ const struct fscache_state __STATE_NAME(n) = { \
+ .name = #n, \
+ .short_name = sn, \
+ .work = NULL, \
+ .transitions = { __VA_ARGS__, { 0, NULL } } \
+ }
+
+#define TRANSIT_TO(state, emask) \
+ { .events = (emask), .transit_to = STATE(state) }
+
+/*
+ * The object state machine.
+ */
+static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object);
+static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready);
+static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation);
+static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object);
+static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object);
+static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available);
+static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents);
+
+static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object);
+static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object);
+
+static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure);
+static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object);
+static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents);
+static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object);
+static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL);
+
+static WAIT_STATE(WAIT_FOR_INIT, "?INI",
+ TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_PARENT, "?PRN",
+ TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY));
+
+static WAIT_STATE(WAIT_FOR_CMD, "?CMD",
+ TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE),
+ TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE),
+ TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR",
+ TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED));
+
+/*
+ * Out-of-band event transition tables. These are for handling unexpected
+ * events, such as an I/O error. If an OOB event occurs, the state machine
+ * clears and disables the event and forces a transition to the nominated work
+ * state (acurrently executing work states will complete first).
+ *
+ * In such a situation, object->state remembers the state the machine should
+ * have been in/gone to and returning NO_TRANSIT returns to that.
+ */
+static const struct fscache_transition fscache_osm_init_oob[] = {
+ TRANSIT_TO(ABORT_INIT,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_lookup_oob[] = {
+ TRANSIT_TO(LOOKUP_FAILURE,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
};
-EXPORT_SYMBOL(fscache_object_states);
-
-const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
- [FSCACHE_OBJECT_INIT] = "INIT",
- [FSCACHE_OBJECT_LOOKING_UP] = "LOOK",
- [FSCACHE_OBJECT_CREATING] = "CRTN",
- [FSCACHE_OBJECT_AVAILABLE] = "AVBL",
- [FSCACHE_OBJECT_ACTIVE] = "ACTV",
- [FSCACHE_OBJECT_INVALIDATING] = "INVL",
- [FSCACHE_OBJECT_UPDATING] = "UPDT",
- [FSCACHE_OBJECT_DYING] = "DYNG",
- [FSCACHE_OBJECT_LC_DYING] = "LCDY",
- [FSCACHE_OBJECT_ABORT_INIT] = "ABTI",
- [FSCACHE_OBJECT_RELEASING] = "RELS",
- [FSCACHE_OBJECT_RECYCLING] = "RCYC",
- [FSCACHE_OBJECT_WITHDRAWING] = "WTHD",
- [FSCACHE_OBJECT_DEAD] = "DEAD",
+
+static const struct fscache_transition fscache_osm_run_oob[] = {
+ TRANSIT_TO(KILL_OBJECT,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
};
static int fscache_get_object(struct fscache_object *);
static void fscache_put_object(struct fscache_object *);
-static void fscache_initialise_object(struct fscache_object *);
-static void fscache_lookup_object(struct fscache_object *);
-static void fscache_object_available(struct fscache_object *);
-static void fscache_invalidate_object(struct fscache_object *);
-static void fscache_release_object(struct fscache_object *);
-static void fscache_withdraw_object(struct fscache_object *);
-static void fscache_enqueue_dependents(struct fscache_object *);
+static bool fscache_enqueue_dependents(struct fscache_object *, int);
static void fscache_dequeue_object(struct fscache_object *);
/*
@@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
object->debug_id, parent->debug_id, parent->n_ops);
spin_lock_nested(&parent->lock, 1);
- parent->n_ops--;
parent->n_obj_ops--;
+ parent->n_ops--;
if (parent->n_ops == 0)
fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
spin_unlock(&parent->lock);
}
/*
- * Notify netfs of invalidation completion.
+ * Object state machine dispatcher.
*/
-static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+static void fscache_object_sm_dispatcher(struct fscache_object *object)
{
- if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
-}
-
-/*
- * process events that have been sent to an object's state machine
- * - initiates parent lookup
- * - does object lookup
- * - does object creation
- * - does object recycling and retirement
- * - does object withdrawal
- */
-static void fscache_object_state_machine(struct fscache_object *object)
-{
- enum fscache_object_state new_state;
- struct fscache_cookie *cookie;
- int event;
+ const struct fscache_transition *t;
+ const struct fscache_state *state, *new_state;
+ unsigned long events, event_mask;
+ int event = -1;
ASSERT(object != NULL);
_enter("{OBJ%x,%s,%lx}",
- object->debug_id, fscache_object_states[object->state],
- object->events);
-
- switch (object->state) {
- /* wait for the parent object to become ready */
- case FSCACHE_OBJECT_INIT:
- object->event_mask =
- FSCACHE_OBJECT_EVENTS_MASK &
- ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- fscache_initialise_object(object);
- goto done;
-
- /* look up the object metadata on disk */
- case FSCACHE_OBJECT_LOOKING_UP:
- fscache_lookup_object(object);
- goto lookup_transit;
-
- /* create the object metadata on disk */
- case FSCACHE_OBJECT_CREATING:
- fscache_lookup_object(object);
- goto lookup_transit;
-
- /* handle an object becoming available; start pending
- * operations and queue dependent operations for processing */
- case FSCACHE_OBJECT_AVAILABLE:
- fscache_object_available(object);
- goto active_transit;
-
- /* normal running state */
- case FSCACHE_OBJECT_ACTIVE:
- goto active_transit;
-
- /* Invalidate an object on disk */
- case FSCACHE_OBJECT_INVALIDATING:
- clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
- fscache_stat(&fscache_n_invalidates_run);
- fscache_stat(&fscache_n_cop_invalidate_object);
- fscache_invalidate_object(object);
- fscache_stat_d(&fscache_n_cop_invalidate_object);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
- goto active_transit;
-
- /* update the object metadata on disk */
- case FSCACHE_OBJECT_UPDATING:
- clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
- fscache_stat(&fscache_n_updates_run);
- fscache_stat(&fscache_n_cop_update_object);
- object->cache->ops->update_object(object);
- fscache_stat_d(&fscache_n_cop_update_object);
- goto active_transit;
-
- /* handle an object dying during lookup or creation */
- case FSCACHE_OBJECT_LC_DYING:
- object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
- fscache_stat(&fscache_n_cop_lookup_complete);
- object->cache->ops->lookup_complete(object);
- fscache_stat_d(&fscache_n_cop_lookup_complete);
-
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DYING;
- cookie = object->cookie;
- if (cookie) {
- if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
- &cookie->flags))
- wake_up_bit(&cookie->flags,
- FSCACHE_COOKIE_LOOKING_UP);
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
- &cookie->flags))
- wake_up_bit(&cookie->flags,
- FSCACHE_COOKIE_CREATING);
+ object->debug_id, object->state->name, object->events);
+
+ event_mask = object->event_mask;
+restart:
+ object->event_mask = 0; /* Mask normal event handling */
+ state = object->state;
+restart_masked:
+ events = object->events;
+
+ /* Handle any out-of-band events (typically an error) */
+ if (events & object->oob_event_mask) {
+ _debug("{OBJ%x} oob %lx",
+ object->debug_id, events & object->oob_event_mask);
+ for (t = object->oob_table; t->events; t++) {
+ if (events & t->events) {
+ state = t->transit_to;
+ ASSERT(state->work != NULL);
+ event = fls(events & t->events) - 1;
+ __clear_bit(event, &object->oob_event_mask);
+ clear_bit(event, &object->events);
+ goto execute_work_state;
+ }
}
- spin_unlock(&object->lock);
+ }
- fscache_done_parent_op(object);
+ /* Wait states are just transition tables */
+ if (!state->work) {
+ if (events & event_mask) {
+ for (t = state->transitions; t->events; t++) {
+ if (events & t->events) {
+ new_state = t->transit_to;
+ event = fls(events & t->events) - 1;
+ clear_bit(event, &object->events);
+ _debug("{OBJ%x} ev %d: %s -> %s",
+ object->debug_id, event,
+ state->name, new_state->name);
+ object->state = state = new_state;
+ goto execute_work_state;
+ }
+ }
- /* wait for completion of all active operations on this object
- * and the death of all child objects of this object */
- case FSCACHE_OBJECT_DYING:
- dying:
- clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
- spin_lock(&object->lock);
- _debug("dying OBJ%x {%d,%d}",
- object->debug_id, object->n_ops, object->n_children);
- if (object->n_ops == 0 && object->n_children == 0) {
- object->event_mask &=
- ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- object->event_mask |=
- (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR);
- } else {
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- object->event_mask |=
- 1 << FSCACHE_OBJECT_EV_CLEARED;
+ /* The event mask didn't include all the tabled bits */
+ BUG();
}
- spin_unlock(&object->lock);
- fscache_enqueue_dependents(object);
- fscache_start_operations(object);
- goto terminal_transit;
-
- /* handle an abort during initialisation */
- case FSCACHE_OBJECT_ABORT_INIT:
- _debug("handle abort init %lx", object->events);
- object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-
- spin_lock(&object->lock);
- fscache_dequeue_object(object);
-
- object->state = FSCACHE_OBJECT_DYING;
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
- &object->cookie->flags))
- wake_up_bit(&object->cookie->flags,
- FSCACHE_COOKIE_CREATING);
- spin_unlock(&object->lock);
- goto dying;
-
- /* handle the netfs releasing an object and possibly marking it
- * obsolete too */
- case FSCACHE_OBJECT_RELEASING:
- case FSCACHE_OBJECT_RECYCLING:
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- fscache_release_object(object);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DEAD;
- spin_unlock(&object->lock);
- fscache_stat(&fscache_n_object_dead);
- goto terminal_transit;
-
- /* handle the parent cache of this object being withdrawn from
- * active service */
- case FSCACHE_OBJECT_WITHDRAWING:
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- fscache_withdraw_object(object);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DEAD;
- spin_unlock(&object->lock);
- fscache_stat(&fscache_n_object_dead);
- goto terminal_transit;
-
- /* complain about the object being woken up once it is
- * deceased */
- case FSCACHE_OBJECT_DEAD:
- printk(KERN_ERR "FS-Cache:"
- " Unexpected event in dead state %lx\n",
- object->events & object->event_mask);
- BUG();
-
- default:
- printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
- object->state);
- BUG();
- }
-
- /* determine the transition from a lookup state */
-lookup_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- case FSCACHE_OBJECT_EV_RETIRE:
- case FSCACHE_OBJECT_EV_RELEASE:
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_LC_DYING;
- goto change_state;
- case FSCACHE_OBJECT_EV_INVALIDATE:
- new_state = FSCACHE_OBJECT_INVALIDATING;
- goto change_state;
- case FSCACHE_OBJECT_EV_REQUEUE:
- goto done;
- case -1:
- goto done; /* sleep until event */
- default:
- goto unsupported_event;
+ /* Randomly woke up */
+ goto unmask_events;
}
- /* determine the transition from an active state */
-active_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- case FSCACHE_OBJECT_EV_RETIRE:
- case FSCACHE_OBJECT_EV_RELEASE:
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_DYING;
- goto change_state;
- case FSCACHE_OBJECT_EV_INVALIDATE:
- new_state = FSCACHE_OBJECT_INVALIDATING;
- goto change_state;
- case FSCACHE_OBJECT_EV_UPDATE:
- new_state = FSCACHE_OBJECT_UPDATING;
- goto change_state;
- case -1:
- new_state = FSCACHE_OBJECT_ACTIVE;
- goto change_state; /* sleep until event */
- default:
- goto unsupported_event;
- }
+execute_work_state:
+ _debug("{OBJ%x} exec %s", object->debug_id, state->name);
- /* determine the transition from a terminal state */
-terminal_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- new_state = FSCACHE_OBJECT_WITHDRAWING;
- goto change_state;
- case FSCACHE_OBJECT_EV_RETIRE:
- new_state = FSCACHE_OBJECT_RECYCLING;
- goto change_state;
- case FSCACHE_OBJECT_EV_RELEASE:
- new_state = FSCACHE_OBJECT_RELEASING;
- goto change_state;
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_WITHDRAWING;
- goto change_state;
- case FSCACHE_OBJECT_EV_CLEARED:
- new_state = FSCACHE_OBJECT_DYING;
- goto change_state;
- case -1:
- goto done; /* sleep until event */
- default:
- goto unsupported_event;
+ new_state = state->work(object, event);
+ event = -1;
+ if (new_state == NO_TRANSIT) {
+ _debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+ fscache_enqueue_object(object);
+ event_mask = object->oob_event_mask;
+ goto unmask_events;
}
-change_state:
- spin_lock(&object->lock);
- object->state = new_state;
- spin_unlock(&object->lock);
+ _debug("{OBJ%x} %s -> %s",
+ object->debug_id, state->name, new_state->name);
+ object->state = state = new_state;
-done:
- _leave(" [->%s]", fscache_object_states[object->state]);
- return;
+ if (state->work) {
+ if (unlikely(state->work == ((void *)2UL))) {
+ _leave(" [dead]");
+ return;
+ }
+ goto restart_masked;
+ }
-unsupported_event:
- printk(KERN_ERR "FS-Cache:"
- " Unsupported event %d [%lx/%lx] in state %s\n",
- event, object->events, object->event_mask,
- fscache_object_states[object->state]);
- BUG();
+ /* Transited to wait state */
+ event_mask = object->oob_event_mask;
+ for (t = state->transitions; t->events; t++)
+ event_mask |= t->events;
+
+unmask_events:
+ object->event_mask = event_mask;
+ smp_mb();
+ events = object->events;
+ if (events & event_mask)
+ goto restart;
+ _leave(" [msk %lx]", event_mask);
}
/*
* execute an object
*/
-void fscache_object_work_func(struct work_struct *work)
+static void fscache_object_work_func(struct work_struct *work)
{
struct fscache_object *object =
container_of(work, struct fscache_object, work);
@@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work)
_enter("{OBJ%x}", object->debug_id);
start = jiffies;
- fscache_object_state_machine(object);
+ fscache_object_sm_dispatcher(object);
fscache_hist(fscache_objs_histogram, start);
- if (object->events & object->event_mask)
- fscache_enqueue_object(object);
- clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
fscache_put_object(object);
}
-EXPORT_SYMBOL(fscache_object_work_func);
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ * @cookie: Cookie object will be attached to
+ * @cache: Cache in which backing object will be found
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_object_init(struct fscache_object *object,
+ struct fscache_cookie *cookie,
+ struct fscache_cache *cache)
+{
+ const struct fscache_transition *t;
+
+ atomic_inc(&cache->object_count);
+
+ object->state = STATE(WAIT_FOR_INIT);
+ object->oob_table = fscache_osm_init_oob;
+ object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
+ spin_lock_init(&object->lock);
+ INIT_LIST_HEAD(&object->cache_link);
+ INIT_HLIST_NODE(&object->cookie_link);
+ INIT_WORK(&object->work, fscache_object_work_func);
+ INIT_LIST_HEAD(&object->dependents);
+ INIT_LIST_HEAD(&object->dep_link);
+ INIT_LIST_HEAD(&object->pending_ops);
+ object->n_children = 0;
+ object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+ object->events = 0;
+ object->store_limit = 0;
+ object->store_limit_l = 0;
+ object->cache = cache;
+ object->cookie = cookie;
+ object->parent = NULL;
+
+ object->oob_event_mask = 0;
+ for (t = object->oob_table; t->events; t++)
+ object->oob_event_mask |= t->events;
+ object->event_mask = object->oob_event_mask;
+ for (t = object->state->transitions; t->events; t++)
+ object->event_mask |= t->events;
+}
+EXPORT_SYMBOL(fscache_object_init);
+
+/*
+ * Abort object initialisation before we start it.
+ */
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ object->oob_event_mask = 0;
+ fscache_dequeue_object(object);
+ return transit_to(KILL_OBJECT);
+}
/*
* initialise an object
@@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func);
* immediately to do a creation
* - we may need to start the process of creating a parent and we need to wait
* for the parent's lookup and creation to complete if it's not there yet
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- * leaf-most cookies of the object and all its children
*/
-static void fscache_initialise_object(struct fscache_object *object)
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
+ int event)
{
struct fscache_object *parent;
+ bool success;
- _enter("");
- ASSERT(object->cookie != NULL);
- ASSERT(object->cookie->parent != NULL);
-
- if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
- _debug("abort init %lx", object->events);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_ABORT_INIT;
- spin_unlock(&object->lock);
- return;
- }
+ _enter("{OBJ%x},%d", object->debug_id, event);
- spin_lock(&object->cookie->lock);
- spin_lock_nested(&object->cookie->parent->lock, 1);
+ ASSERT(list_empty(&object->dep_link));
parent = object->parent;
if (!parent) {
- _debug("no parent");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- } else {
- spin_lock(&object->lock);
- spin_lock_nested(&parent->lock, 1);
- _debug("parent %s", fscache_object_states[parent->state]);
-
- if (parent->state >= FSCACHE_OBJECT_DYING) {
- _debug("bad parent");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
- _debug("wait");
-
- /* we may get woken up in this state by child objects
- * binding on to us, so we need to make sure we don't
- * add ourself to the list multiple times */
- if (list_empty(&object->dep_link)) {
- fscache_stat(&fscache_n_cop_grab_object);
- object->cache->ops->grab_object(object);
- fscache_stat_d(&fscache_n_cop_grab_object);
- list_add(&object->dep_link,
- &parent->dependents);
-
- /* fscache_acquire_non_index_cookie() uses this
- * to wake the chain up */
- if (parent->state == FSCACHE_OBJECT_INIT)
- fscache_enqueue_object(parent);
- }
- } else {
- _debug("go");
- parent->n_ops++;
- parent->n_obj_ops++;
- object->lookup_jif = jiffies;
- object->state = FSCACHE_OBJECT_LOOKING_UP;
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- }
+ _leave(" [no parent]");
+ return transit_to(DROP_OBJECT);
+ }
- spin_unlock(&parent->lock);
- spin_unlock(&object->lock);
+ _debug("parent: %s of:%lx", parent->state->name, parent->flags);
+
+ if (fscache_object_is_dying(parent)) {
+ _leave(" [bad parent]");
+ return transit_to(DROP_OBJECT);
}
- spin_unlock(&object->cookie->parent->lock);
- spin_unlock(&object->cookie->lock);
+ if (fscache_object_is_available(parent)) {
+ _leave(" [ready]");
+ return transit_to(PARENT_READY);
+ }
+
+ _debug("wait");
+
+ spin_lock(&parent->lock);
+ fscache_stat(&fscache_n_cop_grab_object);
+ success = false;
+ if (fscache_object_is_live(parent) &&
+ object->cache->ops->grab_object(object)) {
+ list_add(&object->dep_link, &parent->dependents);
+ success = true;
+ }
+ fscache_stat_d(&fscache_n_cop_grab_object);
+ spin_unlock(&parent->lock);
+ if (!success) {
+ _leave(" [grab failed]");
+ return transit_to(DROP_OBJECT);
+ }
+
+ /* fscache_acquire_non_index_cookie() uses this
+ * to wake the chain up */
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
+ _leave(" [wait]");
+ return transit_to(WAIT_FOR_PARENT);
+}
+
+/*
+ * Once the parent object is ready, we should kick off our lookup op.
+ */
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
+ int event)
+{
+ struct fscache_object *parent = object->parent;
+
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ ASSERT(parent != NULL);
+
+ spin_lock(&parent->lock);
+ parent->n_ops++;
+ parent->n_obj_ops++;
+ object->lookup_jif = jiffies;
+ spin_unlock(&parent->lock);
+
_leave("");
+ return transit_to(LOOK_UP_OBJECT);
}
/*
* look an object up in the cache from which it was allocated
* - we hold an "access lock" on the parent object, so the parent object cannot
* be withdrawn by either party till we've finished
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- * leaf-most cookies of the object and all its children
*/
-static void fscache_lookup_object(struct fscache_object *object)
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
+ int event)
{
struct fscache_cookie *cookie = object->cookie;
- struct fscache_object *parent;
+ struct fscache_object *parent = object->parent;
int ret;
- _enter("");
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ object->oob_table = fscache_osm_lookup_oob;
- parent = object->parent;
ASSERT(parent != NULL);
ASSERTCMP(parent->n_ops, >, 0);
ASSERTCMP(parent->n_obj_ops, >, 0);
/* make sure the parent is still available */
- ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
-
- if (parent->state >= FSCACHE_OBJECT_DYING ||
- test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
- _debug("unavailable");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- _leave("");
- return;
+ ASSERT(fscache_object_is_available(parent));
+
+ if (fscache_object_is_dying(parent) ||
+ test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
+ !fscache_use_cookie(object)) {
+ _leave(" [unavailable]");
+ return transit_to(LOOKUP_FAILURE);
}
- _debug("LOOKUP \"%s/%s\" in \"%s\"",
- parent->cookie->def->name, cookie->def->name,
- object->cache->tag->name);
+ _debug("LOOKUP \"%s\" in \"%s\"",
+ cookie->def->name, object->cache->tag->name);
fscache_stat(&fscache_n_object_lookups);
fscache_stat(&fscache_n_cop_lookup_object);
ret = object->cache->ops->lookup_object(object);
fscache_stat_d(&fscache_n_cop_lookup_object);
- if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
- set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+ fscache_unuse_cookie(object);
if (ret == -ETIMEDOUT) {
/* probably stuck behind another object, so move this one to
* the back of the queue */
fscache_stat(&fscache_n_object_lookups_timed_out);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+ _leave(" [timeout]");
+ return NO_TRANSIT;
}
- _leave("");
+ if (ret < 0) {
+ _leave(" [error]");
+ return transit_to(LOOKUP_FAILURE);
+ }
+
+ _leave(" [ok]");
+ return transit_to(OBJECT_AVAILABLE);
}
/**
@@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x,%s}",
- object->debug_id, fscache_object_states[object->state]);
+ _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
- spin_lock(&object->lock);
- if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
fscache_stat(&fscache_n_object_lookups_negative);
- /* transit here to allow write requests to begin stacking up
- * and read requests to begin returning ENODATA */
- object->state = FSCACHE_OBJECT_CREATING;
- spin_unlock(&object->lock);
-
- set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+ /* Allow write requests to begin stacking up and read requests to begin
+ * returning ENODATA.
+ */
set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
_debug("wake up lookup %p", &cookie->flags);
- smp_mb__before_clear_bit();
- clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- } else {
- ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
- spin_unlock(&object->lock);
}
-
_leave("");
}
EXPORT_SYMBOL(fscache_object_lookup_negative);
@@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x,%s}",
- object->debug_id, fscache_object_states[object->state]);
+ _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
/* if we were still looking up, then we must have a positive lookup
* result, in which case there may be data available */
- spin_lock(&object->lock);
- if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
fscache_stat(&fscache_n_object_lookups_positive);
- clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+ /* We do (presumably) have data */
+ clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
- object->state = FSCACHE_OBJECT_AVAILABLE;
- spin_unlock(&object->lock);
-
- smp_mb__before_clear_bit();
- clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- smp_mb__after_clear_bit();
+ /* Allow write requests to begin stacking up and read requests
+ * to begin shovelling data.
+ */
+ clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
} else {
- ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
fscache_stat(&fscache_n_object_created);
-
- object->state = FSCACHE_OBJECT_AVAILABLE;
- spin_unlock(&object->lock);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- smp_wmb();
}
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
-
+ set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
_leave("");
}
EXPORT_SYMBOL(fscache_obtained_object);
@@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object);
/*
* handle an object that has just become available
*/
-static void fscache_object_available(struct fscache_object *object)
+static const struct fscache_state *fscache_object_available(struct fscache_object *object,
+ int event)
{
- _enter("{OBJ%x}", object->debug_id);
+ _enter("{OBJ%x},%d", object->debug_id, event);
- spin_lock(&object->lock);
+ object->oob_table = fscache_osm_run_oob;
- if (object->cookie &&
- test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
- wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+ spin_lock(&object->lock);
fscache_done_parent_op(object);
if (object->n_in_progress == 0) {
@@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object)
fscache_stat(&fscache_n_cop_lookup_complete);
object->cache->ops->lookup_complete(object);
fscache_stat_d(&fscache_n_cop_lookup_complete);
- fscache_enqueue_dependents(object);
fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
fscache_stat(&fscache_n_object_avail);
_leave("");
+ return transit_to(JUMPSTART_DEPS);
}
/*
- * drop an object's attachments
+ * Wake up this object's dependent objects now that we've become available.
*/
-static void fscache_drop_object(struct fscache_object *object)
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
+ int event)
{
- struct fscache_object *parent = object->parent;
- struct fscache_cache *cache = object->cache;
+ _enter("{OBJ%x},%d", object->debug_id, event);
- _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+ if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
+ return NO_TRANSIT; /* Not finished; requeue */
+ return transit_to(WAIT_FOR_CMD);
+}
- ASSERTCMP(object->cookie, ==, NULL);
- ASSERT(hlist_unhashed(&object->cookie_link));
+/*
+ * Handle lookup or creation failute.
+ */
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
+ int event)
+{
+ struct fscache_cookie *cookie;
- spin_lock(&cache->object_list_lock);
- list_del_init(&object->cache_link);
- spin_unlock(&cache->object_list_lock);
+ _enter("{OBJ%x},%d", object->debug_id, event);
- fscache_stat(&fscache_n_cop_drop_object);
- cache->ops->drop_object(object);
- fscache_stat_d(&fscache_n_cop_drop_object);
+ object->oob_event_mask = 0;
- if (parent) {
- _debug("release parent OBJ%x {%d}",
- parent->debug_id, parent->n_children);
+ fscache_stat(&fscache_n_cop_lookup_complete);
+ object->cache->ops->lookup_complete(object);
+ fscache_stat_d(&fscache_n_cop_lookup_complete);
- spin_lock(&parent->lock);
- parent->n_children--;
- if (parent->n_children == 0)
- fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
- spin_unlock(&parent->lock);
- object->parent = NULL;
+ cookie = object->cookie;
+ set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+ if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
+ fscache_done_parent_op(object);
+ return transit_to(KILL_OBJECT);
+}
+
+/*
+ * Wait for completion of all active operations on this object and the death of
+ * all child objects of this object.
+ */
+static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x,%d,%d},%d",
+ object->debug_id, object->n_ops, object->n_children, event);
+
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ object->oob_event_mask = 0;
+
+ if (list_empty(&object->dependents) &&
+ object->n_ops == 0 &&
+ object->n_children == 0)
+ return transit_to(DROP_OBJECT);
+
+ if (object->n_in_progress == 0) {
+ spin_lock(&object->lock);
+ if (object->n_ops > 0 && object->n_in_progress == 0)
+ fscache_start_operations(object);
+ spin_unlock(&object->lock);
}
- /* this just shifts the object release to the work processor */
- fscache_put_object(object);
+ if (!list_empty(&object->dependents))
+ return transit_to(KILL_DEPENDENTS);
- _leave("");
+ return transit_to(WAIT_FOR_CLEARANCE);
}
/*
- * release or recycle an object that the netfs has discarded
+ * Kill dependent objects.
*/
-static void fscache_release_object(struct fscache_object *object)
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
+ int event)
{
- _enter("");
+ _enter("{OBJ%x},%d", object->debug_id, event);
- fscache_drop_object(object);
+ if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
+ return NO_TRANSIT; /* Not finished */
+ return transit_to(WAIT_FOR_CLEARANCE);
}
/*
- * withdraw an object from active service
+ * Drop an object's attachments
*/
-static void fscache_withdraw_object(struct fscache_object *object)
+static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
+ int event)
{
- struct fscache_cookie *cookie;
- bool detached;
+ struct fscache_object *parent = object->parent;
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_cache *cache = object->cache;
+ bool awaken = false;
- _enter("");
+ _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
- spin_lock(&object->lock);
- cookie = object->cookie;
- if (cookie) {
- /* need to get the cookie lock before the object lock, starting
- * from the object pointer */
- atomic_inc(&cookie->usage);
- spin_unlock(&object->lock);
+ ASSERT(cookie != NULL);
+ ASSERT(!hlist_unhashed(&object->cookie_link));
- detached = false;
- spin_lock(&cookie->lock);
- spin_lock(&object->lock);
+ /* Make sure the cookie no longer points here and that the netfs isn't
+ * waiting for us.
+ */
+ spin_lock(&cookie->lock);
+ hlist_del_init(&object->cookie_link);
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ awaken = true;
+ spin_unlock(&cookie->lock);
- if (object->cookie == cookie) {
- hlist_del_init(&object->cookie_link);
- object->cookie = NULL;
- fscache_invalidation_complete(cookie);
- detached = true;
- }
- spin_unlock(&cookie->lock);
- fscache_cookie_put(cookie);
- if (detached)
- fscache_cookie_put(cookie);
- }
+ if (awaken)
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+ /* Prevent a race with our last child, which has to signal EV_CLEARED
+ * before dropping our spinlock.
+ */
+ spin_lock(&object->lock);
spin_unlock(&object->lock);
- fscache_drop_object(object);
-}
+ /* Discard from the cache's collection of objects */
+ spin_lock(&cache->object_list_lock);
+ list_del_init(&object->cache_link);
+ spin_unlock(&cache->object_list_lock);
-/*
- * withdraw an object from active service at the behest of the cache
- * - need break the links to a cached object cookie
- * - called under two situations:
- * (1) recycler decides to reclaim an in-use object
- * (2) a cache is unmounted
- * - have to take care as the cookie can be being relinquished by the netfs
- * simultaneously
- * - the object is pinned by the caller holding a refcount on it
- */
-void fscache_withdrawing_object(struct fscache_cache *cache,
- struct fscache_object *object)
-{
- bool enqueue = false;
+ fscache_stat(&fscache_n_cop_drop_object);
+ cache->ops->drop_object(object);
+ fscache_stat_d(&fscache_n_cop_drop_object);
- _enter(",OBJ%x", object->debug_id);
+ /* The parent object wants to know when all it dependents have gone */
+ if (parent) {
+ _debug("release parent OBJ%x {%d}",
+ parent->debug_id, parent->n_children);
- spin_lock(&object->lock);
- if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
- object->state = FSCACHE_OBJECT_WITHDRAWING;
- enqueue = true;
+ spin_lock(&parent->lock);
+ parent->n_children--;
+ if (parent->n_children == 0)
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+ spin_unlock(&parent->lock);
+ object->parent = NULL;
}
- spin_unlock(&object->lock);
- if (enqueue)
- fscache_enqueue_object(object);
+ /* this just shifts the object release to the work processor */
+ fscache_put_object(object);
+ fscache_stat(&fscache_n_object_dead);
_leave("");
+ return transit_to(OBJECT_DEAD);
}
/*
@@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object)
}
/*
- * discard a ref on a work item
+ * Discard a ref on an object
*/
static void fscache_put_object(struct fscache_object *object)
{
@@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object)
fscache_stat_d(&fscache_n_cop_put_object);
}
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *object)
+{
+ fscache_objlist_remove(object);
+
+ /* We can get rid of the cookie now */
+ fscache_cookie_put(object->cookie);
+ object->cookie = NULL;
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
/*
* enqueue an object for metadata-type processing
*/
@@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object)
/**
* fscache_object_sleep_till_congested - Sleep until object wq is congested
- * @timoutp: Scheduler sleep timeout
+ * @timeoutp: Scheduler sleep timeout
*
* Allow an object handler to sleep until the object workqueue is congested.
*
@@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp)
EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
/*
- * enqueue the dependents of an object for metadata-type processing
- * - the caller must hold the object's lock
- * - this may cause an already locked object to wind up being processed again
+ * Enqueue the dependents of an object for metadata-type processing.
+ *
+ * If we don't manage to finish the list before the scheduler wants to run
+ * again then return false immediately. We return true if the list was
+ * cleared.
*/
-static void fscache_enqueue_dependents(struct fscache_object *object)
+static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
{
struct fscache_object *dep;
+ bool ret = true;
_enter("{OBJ%x}", object->debug_id);
if (list_empty(&object->dependents))
- return;
+ return true;
spin_lock(&object->lock);
@@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
struct fscache_object, dep_link);
list_del_init(&dep->dep_link);
-
- /* sort onto appropriate lists */
- fscache_enqueue_object(dep);
+ fscache_raise_event(dep, event);
fscache_put_object(dep);
- if (!list_empty(&object->dependents))
- cond_resched_lock(&object->lock);
+ if (!list_empty(&object->dependents) && need_resched()) {
+ ret = false;
+ break;
+ }
}
spin_unlock(&object->lock);
+ return ret;
}
/*
* remove an object from whatever queue it's waiting on
- * - the caller must hold object->lock
*/
-void fscache_dequeue_object(struct fscache_object *object)
+static void fscache_dequeue_object(struct fscache_object *object)
{
_enter("{OBJ%x}", object->debug_id);
@@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object)
* @data: The auxiliary data for the object
* @datalen: The size of the auxiliary data
*
- * This function consults the netfs about the coherency state of an object
+ * This function consults the netfs about the coherency state of an object.
+ * The caller must be holding a ref on cookie->n_active (held by
+ * fscache_look_up_object() on behalf of the cache backend during object lookup
+ * and creation).
*/
enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
const void *data, uint16_t datalen)
@@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux);
/*
* Asynchronously invalidate an object.
*/
-static void fscache_invalidate_object(struct fscache_object *object)
+static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
+ int event)
{
struct fscache_operation *op;
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x}", object->debug_id);
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ /* We're going to need the cookie. If the cookie is not available then
+ * retire the object instead.
+ */
+ if (!fscache_use_cookie(object)) {
+ ASSERT(object->cookie->stores.rnode == NULL);
+ set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+ _leave(" [no cookie]");
+ return transit_to(KILL_OBJECT);
+ }
/* Reject any new read/write ops and abort any that are pending. */
fscache_invalidate_writes(cookie);
@@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object)
/* Now we have to wait for in-progress reads and writes */
op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op) {
- fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
- _leave(" [ENOMEM]");
- return;
- }
+ if (!op)
+ goto nomem;
fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
- op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+ op->flags = FSCACHE_OP_ASYNC |
+ (1 << FSCACHE_OP_EXCLUSIVE) |
+ (1 << FSCACHE_OP_UNUSE_COOKIE);
spin_lock(&cookie->lock);
if (fscache_submit_exclusive_op(object, op) < 0)
@@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object)
/* We can allow read and write requests to come in once again. They'll
* queue up behind our exclusive invalidation operation.
*/
- fscache_invalidation_complete(cookie);
- _leave("");
- return;
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+ _leave(" [ok]");
+ return transit_to(UPDATE_OBJECT);
+
+nomem:
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ fscache_unuse_cookie(object);
+ _leave(" [ENOMEM]");
+ return transit_to(KILL_OBJECT);
submit_op_failed:
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
spin_unlock(&cookie->lock);
kfree(op);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
_leave(" [EIO]");
+ return transit_to(KILL_OBJECT);
+}
+
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
+ int event)
+{
+ const struct fscache_state *s;
+
+ fscache_stat(&fscache_n_invalidates_run);
+ fscache_stat(&fscache_n_cop_invalidate_object);
+ s = _fscache_invalidate_object(object, event);
+ fscache_stat_d(&fscache_n_cop_invalidate_object);
+ return s;
+}
+
+/*
+ * Asynchronously update an object.
+ */
+static const struct fscache_state *fscache_update_object(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ fscache_stat(&fscache_n_updates_run);
+ fscache_stat(&fscache_n_cop_update_object);
+ object->cache->ops->update_object(object);
+ fscache_stat_d(&fscache_n_cop_update_object);
+
+ _leave("");
+ return transit_to(WAIT_FOR_CMD);
}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 762a9ec4ffa4..318071aca217 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
ASSERT(list_empty(&op->pend_link));
ASSERT(op->processor != NULL);
- ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
+ ASSERT(fscache_object_is_available(op->object));
ASSERTCMP(atomic_read(&op->usage), >, 0);
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
@@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
/* need to issue a new write op after this */
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
op->object = object;
object->n_ops++;
object->n_exclusive++; /* reads and writes must wait */
@@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
*/
static void fscache_report_unexpected_submission(struct fscache_object *object,
struct fscache_operation *op,
- unsigned long ostate)
+ const struct fscache_state *ostate)
{
static bool once_only;
struct fscache_operation *p;
@@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
once_only = true;
kdebug("unexpected submission OP%x [OBJ%x %s]",
- op->debug_id, object->debug_id,
- fscache_object_states[object->state]);
- kdebug("objstate=%s [%s]",
- fscache_object_states[object->state],
- fscache_object_states[ostate]);
+ op->debug_id, object->debug_id, object->state->name);
+ kdebug("objstate=%s [%s]", object->state->name, ostate->name);
kdebug("objflags=%lx", object->flags);
kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
kdebug("ops=%u inp=%u exc=%u",
@@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
int fscache_submit_op(struct fscache_object *object,
struct fscache_operation *op)
{
- unsigned long ostate;
+ const struct fscache_state *ostate;
int ret;
_enter("{OBJ%x OP%x},{%u}",
@@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object,
fscache_run_op(object, op);
}
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
op->object = object;
object->n_ops++;
atomic_inc(&op->usage);
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_DYING ||
- object->state == FSCACHE_OBJECT_LC_DYING ||
- object->state == FSCACHE_OBJECT_WITHDRAWING) {
+ } else if (fscache_object_is_dying(object)) {
fscache_stat(&fscache_n_op_rejected);
op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
@@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object)
}
/*
- * jump start the operation processing on an object
- * - caller must hold object->lock
+ * Jump start the operation processing on an object. The caller must hold
+ * object->lock.
*/
void fscache_start_operations(struct fscache_object *object)
{
@@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op)
object = op->object;
- if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
- if (atomic_dec_and_test(&object->n_reads)) {
- clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
- &object->cookie->flags);
- wake_up_bit(&object->cookie->flags,
- FSCACHE_COOKIE_WAITING_ON_READS);
- }
- }
+ if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+ atomic_dec(&object->n_reads);
+ if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
+ fscache_unuse_cookie(object);
/* now... we may get called with the object spinlock held, so we
* complete the cleanup here only if we can immediately acquire the
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ff000e52072d..d479ab3c63e4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -109,7 +109,7 @@ page_busy:
* allocator as the work threads writing to the cache may all end up
* sleeping on memory allocation, so we may need to impose a timeout
* too. */
- if (!(gfp & __GFP_WAIT)) {
+ if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
fscache_stat(&fscache_n_store_vmscan_busy);
return false;
}
@@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_stat(&fscache_n_attr_changed_calls);
- if (fscache_object_is_active(object)) {
+ if (fscache_object_is_active(object) &&
+ fscache_use_cookie(object)) {
fscache_stat(&fscache_n_cop_attr_changed);
ret = object->cache->ops->attr_changed(object);
fscache_stat_d(&fscache_n_cop_attr_changed);
+ fscache_unuse_cookie(object);
if (ret < 0)
fscache_abort_object(object);
}
@@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
_enter("{OP%x}", op->op.debug_id);
- ASSERTCMP(op->n_pages, ==, 0);
+ ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
fscache_hist(fscache_retrieval_histogram, op->start_time);
if (op->context)
@@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
* allocate a retrieval op
*/
static struct fscache_retrieval *fscache_alloc_retrieval(
+ struct fscache_cookie *cookie,
struct address_space *mapping,
fscache_rw_complete_t end_io_func,
void *context)
@@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
}
fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
- op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
+ atomic_inc(&cookie->n_active);
+ op->op.flags = FSCACHE_OP_MYTHREAD |
+ (1UL << FSCACHE_OP_WAITING) |
+ (1UL << FSCACHE_OP_UNUSE_COOKIE);
op->mapping = mapping;
op->end_io_func = end_io_func;
op->context = context;
@@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
struct fscache_retrieval *op =
container_of(_op, struct fscache_retrieval, op);
- op->n_pages = 0;
+ atomic_set(&op->n_pages, 0);
}
/*
@@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
+ op = fscache_alloc_retrieval(cookie, page->mapping,
+ end_io_func,context);
if (!op) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
- op->n_pages = 1;
+ atomic_set(&op->n_pages, 1);
spin_lock(&cookie->lock);
@@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
- ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+ ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
atomic_inc(&object->n_reads);
__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -465,6 +472,7 @@ nobufs_unlock_dec:
atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
@@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(mapping, end_io_func, context);
+ op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
if (!op)
return -ENOMEM;
- op->n_pages = *nr_pages;
+ atomic_set(&op->n_pages, *nr_pages);
spin_lock(&cookie->lock);
@@ -589,6 +597,7 @@ nobufs_unlock_dec:
atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
@@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
+ op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
if (!op)
return -ENOMEM;
- op->n_pages = 1;
+ atomic_set(&op->n_pages, 1);
spin_lock(&cookie->lock);
@@ -675,6 +684,7 @@ error:
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_allocs_nobufs);
@@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op)
*/
spin_unlock(&object->lock);
fscache_op_complete(&op->op, false);
- _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
- _op->flags, _op->state, object->state, object->flags);
+ _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
+ _op->flags, _op->state, object->state->short_name,
+ object->flags);
return;
}
@@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
_enter("");
- while (spin_lock(&cookie->stores_lock),
- n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
- ARRAY_SIZE(results),
- FSCACHE_COOKIE_PENDING_TAG),
- n > 0) {
+ for (;;) {
+ spin_lock(&cookie->stores_lock);
+ n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+ ARRAY_SIZE(results),
+ FSCACHE_COOKIE_PENDING_TAG);
+ if (n == 0) {
+ spin_unlock(&cookie->stores_lock);
+ break;
+ }
+
for (i = n - 1; i >= 0; i--) {
page = results[i];
radix_tree_delete(&cookie->stores, page->index);
@@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
page_cache_release(results[i]);
}
- spin_unlock(&cookie->stores_lock);
_leave("");
}
@@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
* (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
* set)
*
- * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
- * fill op)
+ * (a) no writes yet
*
* (b) writes deferred till post-creation (mark page for writing and
* return immediately)
*
* (2) negative lookup, object created, initial fill being made from netfs
- * (FSCACHE_COOKIE_INITIAL_FILL is set)
*
* (a) fill point not yet reached this page (mark page for writing and
* return)
@@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_operation_init(&op->op, fscache_write_op,
fscache_release_write_op);
- op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
+ op->op.flags = FSCACHE_OP_ASYNC |
+ (1 << FSCACHE_OP_WAITING) |
+ (1 << FSCACHE_OP_UNUSE_COOKIE);
ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
if (ret < 0)
@@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
op->store_limit = object->store_limit;
+ atomic_inc(&cookie->n_active);
if (fscache_submit_op(object, &op->op) < 0)
goto submit_failed;
@@ -945,6 +961,7 @@ already_pending:
return 0;
submit_failed:
+ atomic_dec(&cookie->n_active);
spin_lock(&cookie->stores_lock);
radix_tree_delete(&cookie->stores, page->index);
spin_unlock(&cookie->stores_lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 254df56b847b..0eda52738ec4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,7 +14,7 @@
#include <linux/namei.h>
#include <linux/slab.h>
-static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
{
struct fuse_conn *fc = get_fuse_conn(dir);
struct fuse_inode *fi = get_fuse_inode(dir);
@@ -25,7 +25,7 @@ static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
return true;
if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
return true;
- if (filp->f_pos == 0)
+ if (ctx->pos == 0)
return true;
return false;
}
@@ -180,6 +180,8 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
{
struct inode *inode;
+ struct dentry *parent;
+ struct fuse_conn *fc;
inode = ACCESS_ONCE(entry->d_inode);
if (inode && is_bad_inode(inode))
@@ -187,10 +189,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
else if (fuse_dentry_time(entry) < get_jiffies_64()) {
int err;
struct fuse_entry_out outarg;
- struct fuse_conn *fc;
struct fuse_req *req;
struct fuse_forget_link *forget;
- struct dentry *parent;
u64 attr_version;
/* For negative dentries, always do a fresh lookup */
@@ -241,8 +241,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
entry_attr_timeout(&outarg),
attr_version);
fuse_change_entry_timeout(entry, &outarg);
+ } else if (inode) {
+ fc = get_fuse_conn(inode);
+ if (fc->readdirplus_auto) {
+ parent = dget_parent(entry);
+ fuse_advise_use_readdirplus(parent->d_inode);
+ dput(parent);
+ }
}
- fuse_advise_use_readdirplus(inode);
return 1;
}
@@ -1159,25 +1165,23 @@ static int fuse_permission(struct inode *inode, int mask)
}
static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
- void *dstbuf, filldir_t filldir)
+ struct dir_context *ctx)
{
while (nbytes >= FUSE_NAME_OFFSET) {
struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
size_t reclen = FUSE_DIRENT_SIZE(dirent);
- int over;
if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
return -EIO;
if (reclen > nbytes)
break;
- over = filldir(dstbuf, dirent->name, dirent->namelen,
- file->f_pos, dirent->ino, dirent->type);
- if (over)
+ if (!dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->ino, dirent->type))
break;
buf += reclen;
nbytes -= reclen;
- file->f_pos = dirent->off;
+ ctx->pos = dirent->off;
}
return 0;
@@ -1278,7 +1282,7 @@ out:
}
static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
- void *dstbuf, filldir_t filldir, u64 attr_version)
+ struct dir_context *ctx, u64 attr_version)
{
struct fuse_direntplus *direntplus;
struct fuse_dirent *dirent;
@@ -1303,10 +1307,9 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
we need to send a FORGET for each of those
which we did not link.
*/
- over = filldir(dstbuf, dirent->name, dirent->namelen,
- file->f_pos, dirent->ino,
- dirent->type);
- file->f_pos = dirent->off;
+ over = !dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->ino, dirent->type);
+ ctx->pos = dirent->off;
}
buf += reclen;
@@ -1320,7 +1323,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
return 0;
}
-static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+static int fuse_readdir(struct file *file, struct dir_context *ctx)
{
int plus, err;
size_t nbytes;
@@ -1343,17 +1346,17 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
return -ENOMEM;
}
- plus = fuse_use_readdirplus(inode, file);
+ plus = fuse_use_readdirplus(inode, ctx);
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
req->page_descs[0].length = PAGE_SIZE;
if (plus) {
attr_version = fuse_get_attr_version(fc);
- fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
FUSE_READDIRPLUS);
} else {
- fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
FUSE_READDIR);
}
fuse_request_send(fc, req);
@@ -1363,11 +1366,11 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
if (!err) {
if (plus) {
err = parse_dirplusfile(page_address(page), nbytes,
- file, dstbuf, filldir,
+ file, ctx,
attr_version);
} else {
err = parse_dirfile(page_address(page), nbytes, file,
- dstbuf, filldir);
+ ctx);
}
}
@@ -1880,7 +1883,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
static const struct file_operations fuse_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = fuse_readdir,
+ .iterate = fuse_readdir,
.open = fuse_dir_open,
.release = fuse_dir_release,
.fsync = fuse_dir_fsync,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d1c9b85b3f58..35f281033142 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -16,6 +16,7 @@
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/aio.h>
+#include <linux/falloc.h>
static const struct file_operations fuse_direct_io_file_operations;
@@ -1278,7 +1279,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
iov_iter_init(&ii, iov, nr_segs, count, 0);
- req = fuse_get_req(fc, fuse_iter_npages(&ii));
+ if (io->async)
+ req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii));
+ else
+ req = fuse_get_req(fc, fuse_iter_npages(&ii));
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1314,7 +1318,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
break;
if (count) {
fuse_put_request(fc, req);
- req = fuse_get_req(fc, fuse_iter_npages(&ii));
+ if (io->async)
+ req = fuse_get_req_for_background(fc,
+ fuse_iter_npages(&ii));
+ else
+ req = fuse_get_req(fc, fuse_iter_npages(&ii));
if (IS_ERR(req))
break;
}
@@ -2365,6 +2373,11 @@ static void fuse_do_truncate(struct file *file)
fuse_do_setattr(inode, &attr, file);
}
+static inline loff_t fuse_round_up(loff_t off)
+{
+ return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+}
+
static ssize_t
fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
@@ -2372,6 +2385,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
+ bool async_dio = ff->fc->async_dio;
loff_t pos = 0;
struct inode *inode;
loff_t i_size;
@@ -2383,10 +2397,10 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
i_size = i_size_read(inode);
/* optimization for short read */
- if (rw != WRITE && offset + count > i_size) {
+ if (async_dio && rw != WRITE && offset + count > i_size) {
if (offset >= i_size)
return 0;
- count = i_size - offset;
+ count = min_t(loff_t, count, fuse_round_up(i_size - offset));
}
io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
@@ -2404,7 +2418,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* By default, we want to optimize all I/Os with async request
* submission to the client filesystem if supported.
*/
- io->async = ff->fc->async_dio;
+ io->async = async_dio;
io->iocb = iocb;
/*
@@ -2412,7 +2426,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* to wait on real async I/O requests, so we must submit this request
* synchronously.
*/
- if (!is_sync_kiocb(iocb) && (offset + count > i_size))
+ if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
io->async = false;
if (rw == WRITE)
@@ -2424,7 +2438,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
/* we have a non-extending, async request, so return */
- if (ret > 0 && !is_sync_kiocb(iocb))
+ if (!is_sync_kiocb(iocb))
return -EIOCBQUEUED;
ret = wait_on_sync_kiocb(iocb);
@@ -2446,6 +2460,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
loff_t length)
{
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file->f_inode;
struct fuse_conn *fc = ff->fc;
struct fuse_req *req;
struct fuse_fallocate_in inarg = {
@@ -2455,13 +2470,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
.mode = mode
};
int err;
+ bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & FALLOC_FL_PUNCH_HOLE);
if (fc->no_fallocate)
return -EOPNOTSUPP;
+ if (lock_inode) {
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ fuse_set_nowrite(inode);
+ }
+
req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
req->in.h.opcode = FUSE_FALLOCATE;
req->in.h.nodeid = ff->nodeid;
@@ -2476,6 +2501,25 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
}
fuse_put_request(fc, req);
+ if (err)
+ goto out;
+
+ /* we could have extended the file */
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ fuse_write_update_size(inode, offset + length);
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ truncate_pagecache_range(inode, offset, offset + length - 1);
+
+ fuse_invalidate_attr(inode);
+
+out:
+ if (lock_inode) {
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ fuse_release_nowrite(inode);
+ mutex_unlock(&inode->i_mutex);
+ }
+
return err;
}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6201f81e4d3a..9a0cdde14a08 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -867,10 +867,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->dont_mask = 1;
if (arg->flags & FUSE_AUTO_INVAL_DATA)
fc->auto_inval_data = 1;
- if (arg->flags & FUSE_DO_READDIRPLUS)
+ if (arg->flags & FUSE_DO_READDIRPLUS) {
fc->do_readdirplus = 1;
- if (arg->flags & FUSE_READDIRPLUS_AUTO)
- fc->readdirplus_auto = 1;
+ if (arg->flags & FUSE_READDIRPLUS_AUTO)
+ fc->readdirplus_auto = 1;
+ }
if (arg->flags & FUSE_ASYNC_DIO)
fc->async_dio = 1;
} else {
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index eb08c9e43c2a..90c6a8faaecb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -20,13 +20,12 @@ config GFS2_FS
be found here: http://sources.redhat.com/cluster
The "nolock" lock module is now built in to GFS2 by default. If
- you want to use the DLM, be sure to enable HOTPLUG and IPv4/6
- networking.
+ you want to use the DLM, be sure to enable IPv4/6 networking.
config GFS2_FS_LOCKING_DLM
bool "GFS2 DLM locking"
depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
- HOTPLUG && DLM && CONFIGFS_FS && SYSFS
+ CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
help
Multiple node locking module for GFS2
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69ed6336..ee48ad37d9c0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_CACHE_SIZE-1);
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
goto out;
}
return 1;
@@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
/* Is the page fully outside i_size? (truncate in progress) */
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0,
+ PAGE_CACHE_SIZE);
unlock_page(page);
continue;
}
@@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
unlock_buffer(bh);
}
-static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+static void gfs2_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+ unsigned int stop = offset + length;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
struct buffer_head *bh, *head;
unsigned long pos = 0;
BUG_ON(!PageLocked(page));
- if (offset == 0)
+ if (!partial_page)
ClearPageChecked(page);
if (!page_has_buffers(page))
goto out;
bh = head = page_buffers(page);
do {
+ if (pos + bh->b_size > stop)
+ return;
+
if (offset <= pos)
gfs2_discard(sdp, bh);
pos += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
out:
- if (offset == 0)
+ if (!partial_page)
try_to_release_page(page, 0);
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1dc9a13ce6bb..5e2f56fccf6b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size)
unstuff = 1;
}
- error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
+ error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+ (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
+ 0 : RES_QUOTA), 0);
if (error)
goto do_grow_release;
@@ -1286,17 +1288,26 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
if (ret)
return ret;
+ ret = get_write_access(inode);
+ if (ret)
+ return ret;
+
inode_dio_wait(inode);
ret = gfs2_rs_alloc(GFS2_I(inode));
if (ret)
- return ret;
+ goto out;
oldsize = inode->i_size;
- if (newsize >= oldsize)
- return do_grow(inode, newsize);
+ if (newsize >= oldsize) {
+ ret = do_grow(inode, newsize);
+ goto out;
+ }
- return do_shrink(inode, oldsize, newsize);
+ ret = do_shrink(inode, oldsize, newsize);
+out:
+ put_write_access(inode);
+ return ret;
}
int gfs2_truncatei_resume(struct gfs2_inode *ip)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c3e82bd23179..0cb4c1557f20 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -354,22 +354,31 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
return ERR_PTR(-EIO);
}
- hc = kmalloc(hsize, GFP_NOFS);
- ret = -ENOMEM;
+ hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN);
+ if (hc == NULL)
+ hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL);
+
if (hc == NULL)
return ERR_PTR(-ENOMEM);
ret = gfs2_dir_read_data(ip, hc, hsize);
if (ret < 0) {
- kfree(hc);
+ if (is_vmalloc_addr(hc))
+ vfree(hc);
+ else
+ kfree(hc);
return ERR_PTR(ret);
}
spin_lock(&inode->i_lock);
- if (ip->i_hash_cache)
- kfree(hc);
- else
+ if (ip->i_hash_cache) {
+ if (is_vmalloc_addr(hc))
+ vfree(hc);
+ else
+ kfree(hc);
+ } else {
ip->i_hash_cache = hc;
+ }
spin_unlock(&inode->i_lock);
return ip->i_hash_cache;
@@ -385,7 +394,10 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
__be64 *hc = ip->i_hash_cache;
ip->i_hash_cache = NULL;
- kfree(hc);
+ if (is_vmalloc_addr(hc))
+ vfree(hc);
+ else
+ kfree(hc);
}
static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -1113,10 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip)
if (IS_ERR(hc))
return PTR_ERR(hc);
- h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS);
+ hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
+ if (hc2 == NULL)
+ hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
+
if (!hc2)
return -ENOMEM;
+ h = hc2;
error = gfs2_meta_inode_buffer(dip, &dibh);
if (error)
goto out_kfree;
@@ -1145,7 +1161,10 @@ fail:
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
out_kfree:
- kfree(hc2);
+ if (is_vmalloc_addr(hc2))
+ vfree(hc2);
+ else
+ kfree(hc2);
return error;
}
@@ -1194,9 +1213,7 @@ static int compare_dents(const void *a, const void *b)
/**
* do_filldir_main - read out directory entries
* @dip: The GFS2 inode
- * @offset: The offset in the file to read from
- * @opaque: opaque data to pass to filldir
- * @filldir: The function to pass entries to
+ * @ctx: what to feed the entries to
* @darr: an array of struct gfs2_dirent pointers to read
* @entries: the number of entries in darr
* @copied: pointer to int that's non-zero if a entry has been copied out
@@ -1206,11 +1223,10 @@ static int compare_dents(const void *a, const void *b)
* the possibility that they will fall into different readdir buffers or
* that someone will want to seek to that location.
*
- * Returns: errno, >0 on exception from filldir
+ * Returns: errno, >0 if the actor tells you to stop
*/
-static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
- void *opaque, filldir_t filldir,
+static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
const struct gfs2_dirent **darr, u32 entries,
int *copied)
{
@@ -1218,7 +1234,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
u64 off, off_next;
unsigned int x, y;
int run = 0;
- int error = 0;
sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
@@ -1235,9 +1250,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
off_next = be32_to_cpu(dent_next->de_hash);
off_next = gfs2_disk_hash2offset(off_next);
- if (off < *offset)
+ if (off < ctx->pos)
continue;
- *offset = off;
+ ctx->pos = off;
if (off_next == off) {
if (*copied && !run)
@@ -1246,26 +1261,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
} else
run = 0;
} else {
- if (off < *offset)
+ if (off < ctx->pos)
continue;
- *offset = off;
+ ctx->pos = off;
}
- error = filldir(opaque, (const char *)(dent + 1),
+ if (!dir_emit(ctx, (const char *)(dent + 1),
be16_to_cpu(dent->de_name_len),
- off, be64_to_cpu(dent->de_inum.no_addr),
- be16_to_cpu(dent->de_type));
- if (error)
+ be64_to_cpu(dent->de_inum.no_addr),
+ be16_to_cpu(dent->de_type)))
return 1;
*copied = 1;
}
- /* Increment the *offset by one, so the next time we come into the
+ /* Increment the ctx->pos by one, so the next time we come into the
do_filldir fxn, we get the next entry instead of the last one in the
current leaf */
- (*offset)++;
+ ctx->pos++;
return 0;
}
@@ -1289,8 +1303,8 @@ static void gfs2_free_sort_buffer(void *ptr)
kfree(ptr);
}
-static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir, int *copied, unsigned *depth,
+static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
+ int *copied, unsigned *depth,
u64 leaf_no)
{
struct gfs2_inode *ip = GFS2_I(inode);
@@ -1368,8 +1382,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
} while(lfn);
BUG_ON(entries2 != entries);
- error = do_filldir_main(ip, offset, opaque, filldir, darr,
- entries, copied);
+ error = do_filldir_main(ip, ctx, darr, entries, copied);
out_free:
for(i = 0; i < leaf; i++)
brelse(larr[i]);
@@ -1428,15 +1441,13 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
* @dip: dinode pointer
- * @offset: the hash of the last entry read shifted to the right once
- * @opaque: buffer for the filldir function to fill
- * @filldir: points to the filldir function to use
+ * @ctx: actor to feed the entries to
*
* Returns: errno
*/
-static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir, struct file_ra_state *f_ra)
+static int dir_e_read(struct inode *inode, struct dir_context *ctx,
+ struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
u32 hsize, len = 0;
@@ -1447,7 +1458,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
unsigned depth = 0;
hsize = 1 << dip->i_depth;
- hash = gfs2_dir_offset2hash(*offset);
+ hash = gfs2_dir_offset2hash(ctx->pos);
index = hash >> (32 - dip->i_depth);
if (dip->i_hash_cache == NULL)
@@ -1459,7 +1470,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
gfs2_dir_readahead(inode, hsize, index, f_ra);
while (index < hsize) {
- error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
+ error = gfs2_dir_read_leaf(inode, ctx,
&copied, &depth,
be64_to_cpu(lp[index]));
if (error)
@@ -1474,8 +1485,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
return error;
}
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir, struct file_ra_state *f_ra)
+int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+ struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1489,7 +1500,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
return 0;
if (dip->i_diskflags & GFS2_DIF_EXHASH)
- return dir_e_read(inode, offset, opaque, filldir, f_ra);
+ return dir_e_read(inode, ctx, f_ra);
if (!gfs2_is_stuffed(dip)) {
gfs2_consist_inode(dip);
@@ -1521,7 +1532,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
error = -EIO;
goto out;
}
- error = do_filldir_main(dip, offset, opaque, filldir, darr,
+ error = do_filldir_main(dip, ctx, darr,
dip->i_entries, &copied);
out:
kfree(darr);
@@ -1537,9 +1548,9 @@ out:
/**
* gfs2_dir_search - Search a directory
- * @dip: The GFS2 inode
- * @filename:
- * @inode:
+ * @dip: The GFS2 dir inode
+ * @name: The name we are looking up
+ * @fail_on_exist: Fail if the name exists rather than looking it up
*
* This routine searches a directory for a file or another directory.
* Assumes a glock is held on dip.
@@ -1547,22 +1558,25 @@ out:
* Returns: errno
*/
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
+struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
+ bool fail_on_exist)
{
struct buffer_head *bh;
struct gfs2_dirent *dent;
- struct inode *inode;
+ u64 addr, formal_ino;
+ u16 dtype;
dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
if (dent) {
if (IS_ERR(dent))
return ERR_CAST(dent);
- inode = gfs2_inode_lookup(dir->i_sb,
- be16_to_cpu(dent->de_type),
- be64_to_cpu(dent->de_inum.no_addr),
- be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+ dtype = be16_to_cpu(dent->de_type);
+ addr = be64_to_cpu(dent->de_inum.no_addr);
+ formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
brelse(bh);
- return inode;
+ if (fail_on_exist)
+ return ERR_PTR(-EEXIST);
+ return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
}
return ERR_PTR(-ENOENT);
}
@@ -1846,6 +1860,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
ht = kzalloc(size, GFP_NOFS);
+ if (ht == NULL)
+ ht = vzalloc(size);
if (!ht)
return -ENOMEM;
@@ -1933,7 +1949,10 @@ out_rlist:
gfs2_rlist_free(&rlist);
gfs2_quota_unhold(dip);
out:
- kfree(ht);
+ if (is_vmalloc_addr(ht))
+ vfree(ht);
+ else
+ kfree(ht);
return error;
}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 98c960beab35..4f03bbd1873f 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -18,14 +18,15 @@ struct gfs2_inode;
struct gfs2_inum;
extern struct inode *gfs2_dir_search(struct inode *dir,
- const struct qstr *filename);
+ const struct qstr *filename,
+ bool fail_on_exist);
extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
-extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir, struct file_ra_state *f_ra);
+extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+ struct file_ra_state *f_ra);
extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
const struct gfs2_inode *nip, unsigned int new_type);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9973df4ff565..8b9b3775e2e7 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -64,6 +64,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
}
struct get_name_filldir {
+ struct dir_context ctx;
struct gfs2_inum_host inum;
char *name;
};
@@ -88,9 +89,11 @@ static int gfs2_get_name(struct dentry *parent, char *name,
struct inode *dir = parent->d_inode;
struct inode *inode = child->d_inode;
struct gfs2_inode *dip, *ip;
- struct get_name_filldir gnfd;
+ struct get_name_filldir gnfd = {
+ .ctx.actor = get_name_filldir,
+ .name = name
+ };
struct gfs2_holder gh;
- u64 offset = 0;
int error;
struct file_ra_state f_ra = { .start = 0 };
@@ -106,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name,
*name = 0;
gnfd.inum.no_addr = ip->i_no_addr;
gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
- gnfd.name = name;
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
if (error)
return error;
- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);
+ error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra);
gfs2_glock_dq_uninit(&gh);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index acd16764b133..f99f9e8a325f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -82,35 +82,28 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
}
/**
- * gfs2_readdir - Read directory entries from a directory
+ * gfs2_readdir - Iterator for a directory
* @file: The directory to read from
- * @dirent: Buffer for dirents
- * @filldir: Function used to do the copying
+ * @ctx: What to feed directory entries to
*
* Returns: errno
*/
-static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int gfs2_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *dir = file->f_mapping->host;
struct gfs2_inode *dip = GFS2_I(dir);
struct gfs2_holder d_gh;
- u64 offset = file->f_pos;
int error;
- gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
- error = gfs2_glock_nq(&d_gh);
- if (error) {
- gfs2_holder_uninit(&d_gh);
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+ if (error)
return error;
- }
- error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
+ error = gfs2_dir_read(dir, ctx, &file->f_ra);
gfs2_glock_dq_uninit(&d_gh);
- file->f_pos = offset;
-
return error;
}
@@ -402,16 +395,20 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
/* Update file times before taking page lock */
file_update_time(vma->vm_file);
+ ret = get_write_access(inode);
+ if (ret)
+ goto out;
+
ret = gfs2_rs_alloc(ip);
if (ret)
- return ret;
+ goto out_write_access;
gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret)
- goto out;
+ goto out_uninit;
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
set_bit(GIF_SW_PAGED, &ip->i_flags);
@@ -480,12 +477,15 @@ out_quota_unlock:
gfs2_quota_unlock(ip);
out_unlock:
gfs2_glock_dq(&gh);
-out:
+out_uninit:
gfs2_holder_uninit(&gh);
if (ret == 0) {
set_page_dirty(page);
wait_for_stable_page(page);
}
+out_write_access:
+ put_write_access(inode);
+out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
}
@@ -531,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
}
/**
- * gfs2_open - open a file
- * @inode: the inode to open
- * @file: the struct file for this opening
+ * gfs2_open_common - This is common to open and atomic_open
+ * @inode: The inode being opened
+ * @file: The file being opened
*
- * Returns: errno
+ * This maybe called under a glock or not depending upon how it has
+ * been called. We must always be called under a glock for regular
+ * files, however. For other file types, it does not matter whether
+ * we hold the glock or not.
+ *
+ * Returns: Error code or 0 for success
*/
-static int gfs2_open(struct inode *inode, struct file *file)
+int gfs2_open_common(struct inode *inode, struct file *file)
{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder i_gh;
struct gfs2_file *fp;
- int error;
+ int ret;
+
+ if (S_ISREG(inode->i_mode)) {
+ ret = generic_file_open(inode, file);
+ if (ret)
+ return ret;
+ }
- fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
+ fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
if (!fp)
return -ENOMEM;
@@ -553,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file)
gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
file->private_data = fp;
+ return 0;
+}
+
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * After atomic_open, this function is only used for opening files
+ * which are already cached. We must still get the glock for regular
+ * files to ensure that we have the file size uptodate for the large
+ * file check which is in the common code. That is only an issue for
+ * regular files though.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder i_gh;
+ int error;
+ bool need_unlock = false;
if (S_ISREG(ip->i_inode.i_mode)) {
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (error)
- goto fail;
+ return error;
+ need_unlock = true;
+ }
- if (!(file->f_flags & O_LARGEFILE) &&
- i_size_read(inode) > MAX_NON_LFS) {
- error = -EOVERFLOW;
- goto fail_gunlock;
- }
+ error = gfs2_open_common(inode, file);
+ if (need_unlock)
gfs2_glock_dq_uninit(&i_gh);
- }
-
- return 0;
-fail_gunlock:
- gfs2_glock_dq_uninit(&i_gh);
-fail:
- file->private_data = NULL;
- kfree(fp);
return error;
}
@@ -594,10 +617,10 @@ static int gfs2_release(struct inode *inode, struct file *file)
kfree(file->private_data);
file->private_data = NULL;
- if ((file->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1))
- gfs2_rs_delete(ip);
+ if (!(file->f_mode & FMODE_WRITE))
+ return 0;
+ gfs2_rs_delete(ip);
return 0;
}
@@ -1041,7 +1064,7 @@ const struct file_operations gfs2_file_fops = {
};
const struct file_operations gfs2_dir_fops = {
- .readdir = gfs2_readdir,
+ .iterate = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
.release = gfs2_release,
@@ -1071,7 +1094,7 @@ const struct file_operations gfs2_file_fops_nolock = {
};
const struct file_operations gfs2_dir_fops_nolock = {
- .readdir = gfs2_readdir,
+ .iterate = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
.release = gfs2_release,
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c66e99c97571..5f2e5224c51c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
struct gfs2_bufdata *bd, *tmp;
struct buffer_head *bh;
const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
- sector_t blocknr;
gfs2_log_lock(sdp);
spin_lock(&sdp->sd_ail_lock);
@@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
continue;
gfs2_ail_error(gl, bh);
}
- blocknr = bh->b_blocknr;
- bh->b_private = NULL;
- gfs2_remove_from_ail(bd); /* drops ref on bh */
-
- bd->bd_bh = NULL;
- bd->bd_blkno = blocknr;
-
gfs2_trans_add_revoke(sdp, bd);
}
GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8833a4f264e3..bbb2715171cd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -189,6 +189,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
return inode;
fail_refresh:
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
ip->i_iopen_gh.gh_gl->gl_object = NULL;
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_iopen:
@@ -312,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
goto out;
}
- inode = gfs2_dir_search(dir, name);
+ inode = gfs2_dir_search(dir, name, false);
if (IS_ERR(inode))
error = PTR_ERR(inode);
out:
@@ -345,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
if (!dip->i_inode.i_nlink)
return -ENOENT;
- error = gfs2_dir_check(&dip->i_inode, name, NULL);
- switch (error) {
- case -ENOENT:
- error = 0;
- break;
- case 0:
- return -EEXIST;
- default:
- return error;
- }
-
if (dip->i_entries == (u32)-1)
return -EFBIG;
if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
@@ -545,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
* gfs2_create_inode - Create a new inode
* @dir: The parent directory
* @dentry: The new dentry
+ * @file: If non-NULL, the file which is being opened
* @mode: The permissions on the new inode
* @dev: For device nodes, this is the device number
* @symname: For symlinks, this is the link destination
@@ -554,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
*/
static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+ struct file *file,
umode_t mode, dev_t dev, const char *symname,
- unsigned int size, int excl)
+ unsigned int size, int excl, int *opened)
{
const struct qstr *name = &dentry->d_name;
struct gfs2_holder ghs[2];
@@ -563,6 +555,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
+ struct dentry *d;
int error;
u32 aflags = 0;
int arq;
@@ -583,15 +576,30 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail;
error = create_ok(dip, name, mode);
- if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
- inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- gfs2_glock_dq_uninit(ghs);
- d_instantiate(dentry, inode);
- return IS_ERR(inode) ? PTR_ERR(inode) : 0;
- }
if (error)
goto fail_gunlock;
+ inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
+ error = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ d = d_splice_alias(inode, dentry);
+ error = 0;
+ if (file && !IS_ERR(d)) {
+ if (d == NULL)
+ d = dentry;
+ if (S_ISREG(inode->i_mode))
+ error = finish_open(file, d, gfs2_open_common, opened);
+ else
+ error = finish_no_open(file, d);
+ }
+ gfs2_glock_dq_uninit(ghs);
+ if (IS_ERR(d))
+ return PTR_RET(d);
+ return error;
+ } else if (error != -ENOENT) {
+ goto fail_gunlock;
+ }
+
arq = error = gfs2_diradd_alloc_required(dir, name);
if (error < 0)
goto fail_gunlock;
@@ -685,10 +693,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail_gunlock3;
mark_inode_dirty(inode);
+ d_instantiate(dentry, inode);
+ if (file)
+ error = finish_open(file, dentry, gfs2_open_common, opened);
gfs2_glock_dq_uninit(ghs);
gfs2_glock_dq_uninit(ghs + 1);
- d_instantiate(dentry, inode);
- return 0;
+ return error;
fail_gunlock3:
gfs2_glock_dq_uninit(ghs + 1);
@@ -728,36 +738,56 @@ fail:
static int gfs2_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
}
/**
- * gfs2_lookup - Look up a filename in a directory and return its inode
+ * __gfs2_lookup - Look up a filename in a directory and return its inode
* @dir: The directory inode
* @dentry: The dentry of the new inode
- * @nd: passed from Linux VFS, ignored by us
+ * @file: File to be opened
+ * @opened: atomic_open flags
*
- * Called by the VFS layer. Lock dir and call gfs2_lookupi()
*
* Returns: errno
*/
-static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
+static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ struct file *file, int *opened)
{
- struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (inode && !IS_ERR(inode)) {
- struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
- struct gfs2_holder gh;
- int error;
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- if (error) {
- iput(inode);
- return ERR_PTR(error);
- }
- gfs2_glock_dq_uninit(&gh);
+ struct inode *inode;
+ struct dentry *d;
+ struct gfs2_holder gh;
+ struct gfs2_glock *gl;
+ int error;
+
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+ if (!inode)
+ return NULL;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ gl = GFS2_I(inode)->i_gl;
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (error) {
+ iput(inode);
+ return ERR_PTR(error);
}
- return d_splice_alias(inode, dentry);
+
+ d = d_splice_alias(inode, dentry);
+ if (file && S_ISREG(inode->i_mode))
+ error = finish_open(file, dentry, gfs2_open_common, opened);
+
+ gfs2_glock_dq_uninit(&gh);
+ if (error)
+ return ERR_PTR(error);
+ return d;
+}
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned flags)
+{
+ return __gfs2_lookup(dir, dentry, NULL, NULL);
}
/**
@@ -1075,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
return -ENAMETOOLONG;
- return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
}
/**
@@ -1091,7 +1121,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct gfs2_sbd *sdp = GFS2_SB(dir);
unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
- return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
}
/**
@@ -1106,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t dev)
{
- return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
+ return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
+}
+
+/**
+ * gfs2_atomic_open - Atomically open a file
+ * @dir: The directory
+ * @dentry: The proposed new entry
+ * @file: The proposed new struct file
+ * @flags: open flags
+ * @mode: File mode
+ * @opened: Flag to say whether the file has been opened or not
+ *
+ * Returns: error code or 0 for success
+ */
+
+static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags,
+ umode_t mode, int *opened)
+{
+ struct dentry *d;
+ bool excl = !!(flags & O_EXCL);
+
+ d = __gfs2_lookup(dir, dentry, file, opened);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ if (d == NULL)
+ d = dentry;
+ if (d->d_inode) {
+ if (!(*opened & FILE_OPENED))
+ return finish_no_open(file, d);
+ return 0;
+ }
+
+ if (!(flags & O_CREAT))
+ return -ENOENT;
+
+ return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
}
/*
@@ -1786,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = {
.removexattr = gfs2_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
+ .atomic_open = gfs2_atomic_open,
};
const struct inode_operations gfs2_symlink_iops = {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c53c7477f6da..ba4d9492d422 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask);
extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern int gfs2_open_common(struct inode *inode, struct file *file);
extern const struct inode_operations gfs2_file_iops;
extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b404f4853034..610613fb65b5 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -211,15 +211,16 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr, *s;
+ int oldest_tr = 1;
int ret;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
gfs2_ail1_empty_one(sdp, tr);
- if (list_empty(&tr->tr_ail1_list))
+ if (list_empty(&tr->tr_ail1_list) && oldest_tr)
list_move(&tr->tr_list, &sdp->sd_ail2_list);
else
- break;
+ oldest_tr = 0;
}
ret = list_empty(&sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
@@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
{
- unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+ unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
unsigned wanted = blks + reserved_blks;
DEFINE_WAIT(wait);
int did_wait = 0;
@@ -545,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip)
spin_unlock(&sdp->sd_ordered_lock);
}
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+ struct buffer_head *bh = bd->bd_bh;
+ struct gfs2_glock *gl = bd->bd_gl;
+
+ gfs2_remove_from_ail(bd);
+ bd->bd_bh = NULL;
+ bh->b_private = NULL;
+ bd->bd_blkno = bh->b_blocknr;
+ bd->bd_ops = &gfs2_revoke_lops;
+ sdp->sd_log_num_revoke++;
+ atomic_inc(&gl->gl_revokes);
+ set_bit(GLF_LFLUSH, &gl->gl_flags);
+ list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
+}
+
+void gfs2_write_revokes(struct gfs2_sbd *sdp)
+{
+ struct gfs2_trans *tr;
+ struct gfs2_bufdata *bd, *tmp;
+ int have_revokes = 0;
+ int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
+
+ gfs2_ail1_empty(sdp);
+ spin_lock(&sdp->sd_ail_lock);
+ list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
+ if (list_empty(&bd->bd_list)) {
+ have_revokes = 1;
+ goto done;
+ }
+ }
+ }
+done:
+ spin_unlock(&sdp->sd_ail_lock);
+ if (have_revokes == 0)
+ return;
+ while (sdp->sd_log_num_revoke > max_revokes)
+ max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+ max_revokes -= sdp->sd_log_num_revoke;
+ if (!sdp->sd_log_num_revoke) {
+ atomic_dec(&sdp->sd_log_blks_free);
+ /* If no blocks have been reserved, we need to also
+ * reserve a block for the header */
+ if (!sdp->sd_log_blks_reserved)
+ atomic_dec(&sdp->sd_log_blks_free);
+ }
+ gfs2_log_lock(sdp);
+ spin_lock(&sdp->sd_ail_lock);
+ list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
+ if (max_revokes == 0)
+ goto out_of_blocks;
+ if (!list_empty(&bd->bd_list))
+ continue;
+ gfs2_add_revoke(sdp, bd);
+ max_revokes--;
+ }
+ }
+out_of_blocks:
+ spin_unlock(&sdp->sd_ail_lock);
+ gfs2_log_unlock(sdp);
+
+ if (!sdp->sd_log_num_revoke) {
+ atomic_inc(&sdp->sd_log_blks_free);
+ if (!sdp->sd_log_blks_reserved)
+ atomic_inc(&sdp->sd_log_blks_free);
+ }
+}
+
/**
* log_write_header - Get and initialize a journal header buffer
* @sdp: The GFS2 superblock
@@ -562,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
lh = page_address(page);
clear_page(lh);
- gfs2_ail1_empty(sdp);
tail = current_tail(sdp);
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3566f35915e0..37216634f0aa 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
extern int gfs2_logd(void *data);
+extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index c5fa758fd844..17c5b5d7dc88 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -16,6 +16,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/fs.h>
+#include <linux/list_sort.h>
#include "gfs2.h"
#include "incore.h"
@@ -212,7 +213,7 @@ static void gfs2_end_log_write(struct bio *bio, int error)
fs_err(sdp, "Error %d writing to log\n", error);
}
- bio_for_each_segment(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i) {
page = bvec->bv_page;
if (page_has_buffers(page))
gfs2_end_log_write_bh(sdp, bvec, error);
@@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh)
kunmap_atomic(kaddr);
}
+static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct gfs2_bufdata *bda, *bdb;
+
+ bda = list_entry(a, struct gfs2_bufdata, bd_list);
+ bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+
+ if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+ return -1;
+ if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+ return 1;
+ return 0;
+}
+
static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
unsigned int total, struct list_head *blist,
bool is_databuf)
@@ -413,13 +428,16 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
__be64 *ptr;
gfs2_log_lock(sdp);
+ list_sort(NULL, blist, blocknr_cmp);
bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
while(total) {
num = total;
if (total > limit)
num = limit;
gfs2_log_unlock(sdp);
- page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num);
+ page = gfs2_get_log_desc(sdp,
+ is_databuf ? GFS2_LOG_DESC_JDATA :
+ GFS2_LOG_DESC_METADATA, num + 1, num);
ld = page_address(page);
gfs2_log_lock(sdp);
ptr = (__be64 *)(ld + 1);
@@ -588,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
struct page *page;
unsigned int length;
+ gfs2_write_revokes(sdp);
if (!sdp->sd_log_num_revoke)
return;
@@ -834,10 +853,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
.lo_name = "revoke",
};
-const struct gfs2_log_operations gfs2_rg_lops = {
- .lo_name = "rg",
-};
-
const struct gfs2_log_operations gfs2_databuf_lops = {
.lo_before_commit = databuf_lo_before_commit,
.lo_after_commit = databuf_lo_after_commit,
@@ -849,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
const struct gfs2_log_operations *gfs2_log_ops[] = {
&gfs2_databuf_lops,
&gfs2_buf_lops,
- &gfs2_rg_lops,
&gfs2_revoke_lops,
NULL,
};
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 87e062e05c92..9ca2e6438419 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -23,7 +23,6 @@
extern const struct gfs2_log_operations gfs2_glock_lops;
extern const struct gfs2_log_operations gfs2_buf_lops;
extern const struct gfs2_log_operations gfs2_revoke_lops;
-extern const struct gfs2_log_operations gfs2_rg_lops;
extern const struct gfs2_log_operations gfs2_databuf_lops;
extern const struct gfs2_log_operations *gfs2_log_ops[];
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 1a89afb68472..0da390686c08 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -296,10 +296,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
if (bd) {
spin_lock(&sdp->sd_ail_lock);
if (bd->bd_tr) {
- gfs2_remove_from_ail(bd);
- bh->b_private = NULL;
- bd->bd_bh = NULL;
- bd->bd_blkno = bh->b_blocknr;
gfs2_trans_add_revoke(sdp, bd);
}
spin_unlock(&sdp->sd_ail_lock);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..0262c190b6f9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
goto fail_quotad;
p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
- error = IS_ERR(p);
- if (error) {
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
fs_err(sdp, "can't start logd thread: %d\n", error);
return error;
}
sdp->sd_logd_process = p;
p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
- error = IS_ERR(p);
- if (error) {
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
fs_err(sdp, "can't start quotad thread: %d\n", error);
goto fail;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c7c840e916f8..3768c2f40e43 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -121,7 +121,7 @@ static u64 qd2index(struct gfs2_quota_data *qd)
{
struct kqid qid = qd->qd_id;
return (2 * (u64)from_kqid(&init_user_ns, qid)) +
- (qid.type == USRQUOTA) ? 0 : 1;
+ ((qid.type == USRQUOTA) ? 0 : 1);
}
static u64 qd2offset(struct gfs2_quota_data *qd)
@@ -721,7 +721,7 @@ get_a_page:
goto unlock_out;
}
- gfs2_trans_add_meta(ip->i_gl, bh);
+ gfs2_trans_add_data(ip->i_gl, bh);
kaddr = kmap_atomic(page);
if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
@@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type)
return error;
}
-static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
-{
- return gfs2_quota_sync(sb, type);
-}
-
int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
{
struct gfs2_quota_data *qd;
@@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data)
&tune->gt_statfs_quantum);
/* Update quota file */
- quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
+ quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
&quotad_timeo, &tune->gt_quota_quantum);
/* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 0c5a575b513e..69317435faa7 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -638,8 +638,10 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
*/
void gfs2_rs_delete(struct gfs2_inode *ip)
{
+ struct inode *inode = &ip->i_inode;
+
down_write(&ip->i_rw_mutex);
- if (ip->i_res) {
+ if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) {
gfs2_rs_deltree(ip->i_res);
BUG_ON(ip->i_res->rs_free);
kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
@@ -1286,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
minlen = max_t(u64, r.minlen,
q->limits.discard_granularity) >> bs_shift;
+ if (end <= start || minlen > sdp->sd_max_rg_data)
+ return -EINVAL;
+
rgd = gfs2_blk2rgrpd(sdp, start, 0);
- rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
+ rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
- if (end <= start ||
- minlen > sdp->sd_max_rg_data ||
- start > rgd_end->rd_data0 + rgd_end->rd_data)
- return -EINVAL;
+ if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
+ && (start > rgd_end->rd_data0 + rgd_end->rd_data))
+ return -EINVAL; /* start is beyond the end of the fs */
while (1) {
@@ -1334,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
}
out:
- r.len = trimmed << 9;
+ r.len = trimmed << bs_shift;
if (copy_to_user(argp, &r, sizeof(r)))
return -EFAULT;
@@ -1401,9 +1405,14 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
u32 extlen;
u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
int ret;
+ struct inode *inode = &ip->i_inode;
- extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
- extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+ if (S_ISDIR(inode->i_mode))
+ extlen = 1;
+ else {
+ extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
+ extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+ }
if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
return;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 917c8e1eb4ae..e5639dec66c4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1444,6 +1444,7 @@ static void gfs2_evict_inode(struct inode *inode)
/* Must not read inode block until block type has been verified */
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
goto out;
}
@@ -1514,8 +1515,10 @@ out_unlock:
if (gfs2_rs_active(ip->i_res))
gfs2_rs_deltree(ip->i_res);
- if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq(&ip->i_iopen_gh);
+ }
gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_dq_uninit(&gh);
if (error && error != GLR_TRYFAILED && error != -EROFS)
@@ -1534,6 +1537,7 @@ out:
ip->i_gl = NULL;
if (ip->i_iopen_gh.gh_gl) {
ip->i_iopen_gh.gh_gl->gl_object = NULL;
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
}
}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 7374907742a8..2b20d7046bf3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -270,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
- struct gfs2_glock *gl = bd->bd_gl;
struct gfs2_trans *tr = current->journal_info;
BUG_ON(!list_empty(&bd->bd_list));
- BUG_ON(!list_empty(&bd->bd_ail_st_list));
- BUG_ON(!list_empty(&bd->bd_ail_gl_list));
- bd->bd_ops = &gfs2_revoke_lops;
+ gfs2_add_revoke(sdp, bd);
tr->tr_touched = 1;
tr->tr_num_revoke++;
- sdp->sd_log_num_revoke++;
- atomic_inc(&gl->gl_revokes);
- set_bit(GLF_LFLUSH, &gl->gl_flags);
- list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
}
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index f3b1a15ccd59..d3fa6bd9503e 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -415,7 +415,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, num);
spin_unlock(&tree->hash_lock);
- BUG_ON(node);
+ if (node) {
+ pr_crit("new node %u already hashed?\n", num);
+ WARN_ON(1);
+ return node;
+ }
node = __hfs_bnode_create(tree, num);
if (!node)
return ERR_PTR(-ENOMEM);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index e0101b6fb0d7..145566851e7a 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -51,9 +51,9 @@ done:
/*
* hfs_readdir
*/
-static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
int len, err;
char strbuf[HFS_MAX_NAMELEN];
@@ -62,7 +62,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct hfs_readdir_data *rd;
u16 type;
- if (filp->f_pos >= inode->i_size)
+ if (ctx->pos >= inode->i_size)
return 0;
err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
@@ -73,14 +73,13 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (err)
goto out;
- switch ((u32)filp->f_pos) {
- case 0:
+ if (ctx->pos == 0) {
/* This is completely artificial... */
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+ if (!dir_emit_dot(file, ctx))
goto out;
- filp->f_pos++;
- /* fall through */
- case 1:
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
err = -EIO;
goto out;
@@ -97,18 +96,16 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
// err = -EIO;
// goto out;
//}
- if (filldir(dirent, "..", 2, 1,
+ if (!dir_emit(ctx, "..", 2,
be32_to_cpu(entry.thread.ParID), DT_DIR))
goto out;
- filp->f_pos++;
- /* fall through */
- default:
- if (filp->f_pos >= inode->i_size)
- goto out;
- err = hfs_brec_goto(&fd, filp->f_pos - 1);
- if (err)
- goto out;
+ ctx->pos = 2;
}
+ if (ctx->pos >= inode->i_size)
+ goto out;
+ err = hfs_brec_goto(&fd, ctx->pos - 1);
+ if (err)
+ goto out;
for (;;) {
if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
@@ -131,7 +128,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
err = -EIO;
goto out;
}
- if (filldir(dirent, strbuf, len, filp->f_pos,
+ if (!dir_emit(ctx, strbuf, len,
be32_to_cpu(entry.dir.DirID), DT_DIR))
break;
} else if (type == HFS_CDR_FIL) {
@@ -140,7 +137,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
err = -EIO;
goto out;
}
- if (filldir(dirent, strbuf, len, filp->f_pos,
+ if (!dir_emit(ctx, strbuf, len,
be32_to_cpu(entry.file.FlNum), DT_REG))
break;
} else {
@@ -148,22 +145,22 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
err = -EIO;
goto out;
}
- filp->f_pos++;
- if (filp->f_pos >= inode->i_size)
+ ctx->pos++;
+ if (ctx->pos >= inode->i_size)
goto out;
err = hfs_brec_goto(&fd, 1);
if (err)
goto out;
}
- rd = filp->private_data;
+ rd = file->private_data;
if (!rd) {
rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL);
if (!rd) {
err = -ENOMEM;
goto out;
}
- filp->private_data = rd;
- rd->file = filp;
+ file->private_data = rd;
+ rd->file = file;
list_add(&rd->list, &HFS_I(inode)->open_dir_list);
}
memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key));
@@ -306,7 +303,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
const struct file_operations hfs_dir_operations = {
.read = generic_read_dir,
- .readdir = hfs_readdir,
+ .iterate = hfs_readdir,
.llseek = generic_file_llseek,
.release = hfs_dir_release,
};
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index a37ac934732f..d8ce4bd17fc5 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -121,9 +121,9 @@ fail:
return ERR_PTR(err);
}
-static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
int len, err;
char strbuf[HFSPLUS_MAX_STRLEN + 1];
@@ -132,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct hfsplus_readdir_data *rd;
u16 type;
- if (filp->f_pos >= inode->i_size)
+ if (file->f_pos >= inode->i_size)
return 0;
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
@@ -143,14 +143,13 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (err)
goto out;
- switch ((u32)filp->f_pos) {
- case 0:
+ if (ctx->pos == 0) {
/* This is completely artificial... */
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+ if (!dir_emit_dot(file, ctx))
goto out;
- filp->f_pos++;
- /* fall through */
- case 1:
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
err = -EIO;
goto out;
@@ -168,19 +167,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
err = -EIO;
goto out;
}
- if (filldir(dirent, "..", 2, 1,
+ if (!dir_emit(ctx, "..", 2,
be32_to_cpu(entry.thread.parentID), DT_DIR))
goto out;
- filp->f_pos++;
- /* fall through */
- default:
- if (filp->f_pos >= inode->i_size)
- goto out;
- err = hfs_brec_goto(&fd, filp->f_pos - 1);
- if (err)
- goto out;
+ ctx->pos = 2;
}
-
+ if (ctx->pos >= inode->i_size)
+ goto out;
+ err = hfs_brec_goto(&fd, ctx->pos - 1);
+ if (err)
+ goto out;
for (;;) {
if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) {
pr_err("walked past end of dir\n");
@@ -211,7 +207,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
HFSPLUS_SB(sb)->hidden_dir->i_ino ==
be32_to_cpu(entry.folder.id))
goto next;
- if (filldir(dirent, strbuf, len, filp->f_pos,
+ if (!dir_emit(ctx, strbuf, len,
be32_to_cpu(entry.folder.id), DT_DIR))
break;
} else if (type == HFSPLUS_FILE) {
@@ -220,7 +216,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
err = -EIO;
goto out;
}
- if (filldir(dirent, strbuf, len, filp->f_pos,
+ if (!dir_emit(ctx, strbuf, len,
be32_to_cpu(entry.file.id), DT_REG))
break;
} else {
@@ -229,22 +225,22 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto out;
}
next:
- filp->f_pos++;
- if (filp->f_pos >= inode->i_size)
+ ctx->pos++;
+ if (ctx->pos >= inode->i_size)
goto out;
err = hfs_brec_goto(&fd, 1);
if (err)
goto out;
}
- rd = filp->private_data;
+ rd = file->private_data;
if (!rd) {
rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL);
if (!rd) {
err = -ENOMEM;
goto out;
}
- filp->private_data = rd;
- rd->file = filp;
+ file->private_data = rd;
+ rd->file = file;
list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
}
memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
@@ -538,7 +534,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
const struct file_operations hfsplus_dir_operations = {
.fsync = hfsplus_file_fsync,
.read = generic_read_dir,
- .readdir = hfsplus_readdir,
+ .iterate = hfsplus_readdir,
.unlocked_ioctl = hfsplus_ioctl,
.llseek = generic_file_llseek,
.release = hfsplus_dir_release,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 32f35f187989..cddb05217512 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -277,7 +277,7 @@ static const struct super_operations hostfs_sbops = {
.show_options = hostfs_show_options,
};
-int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
+int hostfs_readdir(struct file *file, struct dir_context *ctx)
{
void *dir;
char *name;
@@ -292,12 +292,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
__putname(name);
if (dir == NULL)
return -error;
- next = file->f_pos;
+ next = ctx->pos;
while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
- error = (*filldir)(ent, name, len, file->f_pos,
- ino, type);
- if (error) break;
- file->f_pos = next;
+ if (!dir_emit(ctx, name, len, ino, type))
+ break;
+ ctx->pos = next;
}
close_dir(dir);
return 0;
@@ -393,7 +392,7 @@ static const struct file_operations hostfs_file_fops = {
static const struct file_operations hostfs_dir_fops = {
.llseek = generic_file_llseek,
- .readdir = hostfs_readdir,
+ .iterate = hostfs_readdir,
.read = generic_read_dir,
};
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 546f6d39713a..292b1acb9b81 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -33,36 +33,38 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
if (whence == SEEK_DATA || whence == SEEK_HOLE)
return -EINVAL;
+ mutex_lock(&i->i_mutex);
hpfs_lock(s);
/*printk("dir lseek\n");*/
if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
- mutex_lock(&i->i_mutex);
pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1;
while (pos != new_off) {
if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh);
else goto fail;
if (pos == 12) goto fail;
}
- mutex_unlock(&i->i_mutex);
+ hpfs_add_pos(i, &filp->f_pos);
ok:
+ filp->f_pos = new_off;
hpfs_unlock(s);
- return filp->f_pos = new_off;
-fail:
mutex_unlock(&i->i_mutex);
+ return new_off;
+fail:
/*printk("illegal lseek: %016llx\n", new_off);*/
hpfs_unlock(s);
+ mutex_unlock(&i->i_mutex);
return -ESPIPE;
}
-static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hpfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
struct quad_buffer_head qbh;
struct hpfs_dirent *de;
int lc;
- long old_pos;
+ loff_t next_pos;
unsigned char *tempname;
int c1, c2 = 0;
int ret = 0;
@@ -103,11 +105,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
}
lc = hpfs_sb(inode->i_sb)->sb_lowercase;
- if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */
- filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */
+ if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */
+ ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */
goto out;
}
- if (filp->f_pos == 13) {
+ if (ctx->pos == 13) {
ret = -ENOENT;
goto out;
}
@@ -118,33 +120,34 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
accepted by filldir, but what can I do?
maybe killall -9 ls helps */
if (hpfs_sb(inode->i_sb)->sb_chk)
- if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) {
+ if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) {
ret = -EFSERROR;
goto out;
}
- if (filp->f_pos == 12)
+ if (ctx->pos == 12)
goto out;
- if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) {
- printk("HPFS: warning: pos==%d\n",(int)filp->f_pos);
+ if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) {
+ printk("HPFS: warning: pos==%d\n",(int)ctx->pos);
goto out;
}
- if (filp->f_pos == 0) {
- if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
goto out;
- filp->f_pos = 11;
+ ctx->pos = 11;
}
- if (filp->f_pos == 11) {
- if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0)
+ if (ctx->pos == 11) {
+ if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR))
goto out;
- filp->f_pos = 1;
+ ctx->pos = 1;
}
- if (filp->f_pos == 1) {
- filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
- hpfs_add_pos(inode, &filp->f_pos);
- filp->f_version = inode->i_version;
+ if (ctx->pos == 1) {
+ ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
+ hpfs_add_pos(inode, &file->f_pos);
+ file->f_version = inode->i_version;
}
- old_pos = filp->f_pos;
- if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) {
+ next_pos = ctx->pos;
+ if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) {
+ ctx->pos = next_pos;
ret = -EIOERROR;
goto out;
}
@@ -152,20 +155,21 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (hpfs_sb(inode->i_sb)->sb_chk) {
if (de->first && !de->last && (de->namelen != 2
|| de ->name[0] != 1 || de->name[1] != 1))
- hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos);
+ hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos);
if (de->last && (de->namelen != 1 || de ->name[0] != 255))
- hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos);
+ hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos);
}
hpfs_brelse4(&qbh);
+ ctx->pos = next_pos;
goto again;
}
tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
- if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) {
- filp->f_pos = old_pos;
+ if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) {
if (tempname != de->name) kfree(tempname);
hpfs_brelse4(&qbh);
goto out;
}
+ ctx->pos = next_pos;
if (tempname != de->name) kfree(tempname);
hpfs_brelse4(&qbh);
}
@@ -320,7 +324,7 @@ const struct file_operations hpfs_dir_ops =
{
.llseek = hpfs_dir_lseek,
.read = generic_read_dir,
- .readdir = hpfs_readdir,
+ .iterate = hpfs_readdir,
.release = hpfs_dir_release,
.fsync = hpfs_file_fsync,
};
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3027f4dbbab5..e4ba5fe4c3b5 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -109,10 +109,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
+ hpfs_lock(inode->i_sb);
+
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
hpfs_truncate(inode);
}
+
+ hpfs_unlock(inode->i_sb);
}
static int hpfs_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index cd3e38972c86..fc90ab11c340 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -542,8 +542,8 @@ static const struct file_operations hppfs_file_fops = {
};
struct hppfs_dirent {
- void *vfs_dirent;
- filldir_t filldir;
+ struct dir_context ctx;
+ struct dir_context *caller;
struct dentry *dentry;
};
@@ -555,34 +555,29 @@ static int hppfs_filldir(void *d, const char *name, int size,
if (file_removed(dirent->dentry, name))
return 0;
- return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset,
- inode, type);
+ dirent->caller->pos = dirent->ctx.pos;
+ return !dir_emit(dirent->caller, name, size, inode, type);
}
-static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
+static int hppfs_readdir(struct file *file, struct dir_context *ctx)
{
struct hppfs_private *data = file->private_data;
struct file *proc_file = data->proc_file;
- int (*readdir)(struct file *, void *, filldir_t);
- struct hppfs_dirent dirent = ((struct hppfs_dirent)
- { .vfs_dirent = ent,
- .filldir = filldir,
- .dentry = file->f_path.dentry
- });
+ struct hppfs_dirent d = {
+ .ctx.actor = hppfs_filldir,
+ .caller = ctx,
+ .dentry = file->f_path.dentry
+ };
int err;
-
- readdir = file_inode(proc_file)->i_fop->readdir;
-
- proc_file->f_pos = file->f_pos;
- err = (*readdir)(proc_file, &dirent, hppfs_filldir);
- file->f_pos = proc_file->f_pos;
-
+ proc_file->f_pos = ctx->pos;
+ err = iterate_dir(proc_file, &d.ctx);
+ ctx->pos = d.ctx.pos;
return err;
}
static const struct file_operations hppfs_dir_fops = {
.owner = NULL,
- .readdir = hppfs_readdir,
+ .iterate = hppfs_readdir,
.open = hppfs_dir_open,
.llseek = default_llseek,
.release = hppfs_release,
diff --git a/fs/internal.h b/fs/internal.h
index eaa75f75b625..68121584ae37 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -132,6 +132,12 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
/*
+ * splice.c
+ */
+extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+ loff_t *opos, size_t len, unsigned int flags);
+
+/*
* pipe.c
*/
extern const struct file_operations pipefifo_fops;
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index a7d5c3c3d4e6..b943cbd963bb 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -78,8 +78,8 @@ int get_acorn_filename(struct iso_directory_record *de,
/*
* This should _really_ be cleaned up some day..
*/
-static int do_isofs_readdir(struct inode *inode, struct file *filp,
- void *dirent, filldir_t filldir,
+static int do_isofs_readdir(struct inode *inode, struct file *file,
+ struct dir_context *ctx,
char *tmpname, struct iso_directory_record *tmpde)
{
unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -94,10 +94,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
struct iso_directory_record *de;
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
- offset = filp->f_pos & (bufsize - 1);
- block = filp->f_pos >> bufbits;
+ offset = ctx->pos & (bufsize - 1);
+ block = ctx->pos >> bufbits;
- while (filp->f_pos < inode->i_size) {
+ while (ctx->pos < inode->i_size) {
int de_len;
if (!bh) {
@@ -108,7 +108,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
de = (struct iso_directory_record *) (bh->b_data + offset);
- de_len = *(unsigned char *) de;
+ de_len = *(unsigned char *)de;
/*
* If the length byte is zero, we should move on to the next
@@ -119,8 +119,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
if (de_len == 0) {
brelse(bh);
bh = NULL;
- filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
- block = filp->f_pos >> bufbits;
+ ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
+ block = ctx->pos >> bufbits;
offset = 0;
continue;
}
@@ -164,16 +164,16 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
if (de->flags[-sbi->s_high_sierra] & 0x80) {
first_de = 0;
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
first_de = 1;
/* Handle the case of the '.' directory */
if (de->name_len[0] == 1 && de->name[0] == 0) {
- if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
+ if (!dir_emit_dot(file, ctx))
break;
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
@@ -181,10 +181,9 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
/* Handle the case of the '..' directory */
if (de->name_len[0] == 1 && de->name[0] == 1) {
- inode_number = parent_ino(filp->f_path.dentry);
- if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0)
+ if (!dir_emit_dotdot(file, ctx))
break;
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
@@ -198,7 +197,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
(!sbi->s_showassoc &&
(de->flags[-sbi->s_high_sierra] & 4))) {
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
@@ -230,10 +229,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
}
}
if (len > 0) {
- if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0)
+ if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN))
break;
}
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
@@ -247,13 +246,12 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
* handling split directory entries.. The real work is done by
* "do_isofs_readdir()".
*/
-static int isofs_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int isofs_readdir(struct file *file, struct dir_context *ctx)
{
int result;
char *tmpname;
struct iso_directory_record *tmpde;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
tmpname = (char *)__get_free_page(GFP_KERNEL);
if (tmpname == NULL)
@@ -261,7 +259,7 @@ static int isofs_readdir(struct file *filp,
tmpde = (struct iso_directory_record *) (tmpname+1024);
- result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
+ result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde);
free_page((unsigned long) tmpname);
return result;
@@ -271,7 +269,7 @@ const struct file_operations isofs_dir_operations =
{
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = isofs_readdir,
+ .iterate = isofs_readdir,
};
/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e3e255c0a509..be0c39b66fe0 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -2019,16 +2019,20 @@ zap_buffer_unlocked:
* void journal_invalidatepage() - invalidate a journal page
* @journal: journal to use for flush
* @page: page to flush
- * @offset: length of page to invalidate.
+ * @offset: offset of the range to invalidate
+ * @length: length of the range to invalidate
*
- * Reap page buffers containing data after offset in page.
+ * Reap page buffers containing data in specified range in page.
*/
void journal_invalidatepage(journal_t *journal,
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
if (!PageLocked(page))
@@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
@@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
may_free &= journal_unmap_buffer(journal, bh,
- offset > 0);
+ partial_page);
unlock_buffer(bh);
}
curr_off = next_off;
@@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2944da..5a9f5534d57b 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -20,7 +20,7 @@ config JBD2
config JBD2_DEBUG
bool "JBD2 (ext4) debugging support"
- depends on JBD2 && DEBUG_FS
+ depends on JBD2
help
If you are using the ext4 journaled file system (or
potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@ config JBD2_DEBUG
By default, the debugging output will be turned off.
If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+ with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
number between 1 and 5. The higher the number, the more debugging
output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+ "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841ee81cf..7f34f4716165 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
- nblocks = jbd_space_needed(journal);
- while (__jbd2_log_space_left(journal) < nblocks) {
+ nblocks = jbd2_space_needed(journal);
+ while (jbd2_log_space_left(journal) < nblocks) {
if (journal->j_flags & JBD2_ABORT)
return;
write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
*/
write_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
- nblocks = jbd_space_needed(journal);
- space_left = __jbd2_log_space_left(journal);
+ nblocks = jbd2_space_needed(journal);
+ space_left = jbd2_log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
/* We were able to recover space; yay! */
;
} else if (tid) {
+ /*
+ * jbd2_journal_commit_transaction() may want
+ * to take the checkpoint_mutex if JBD2_FLUSHED
+ * is set. So we need to temporarily drop it.
+ */
+ mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
+ write_lock(&journal->j_state_lock);
+ continue;
} else {
printk(KERN_ERR "%s: needed %d blocks and "
"only had %d space available\n",
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__jbd2_journal_drop_transaction(journal, transaction);
jbd2_journal_free_transaction(transaction);
-
- /* Just in case anybody was waiting for more transactions to be
- checkpointed... */
- wake_up(&journal->j_wait_logspace);
ret = 1;
out:
return ret;
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_forget == NULL);
- J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
- J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0f53946f13c1..559bec1a37b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
#include <trace/events/jbd2.h>
/*
- * Default IO end handler for temporary BJ_IO buffer_heads.
+ * IO end handler for temporary buffer_heads handling writes to the journal.
*/
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
+ struct buffer_head *orig_bh = bh->b_private;
+
BUFFER_TRACE(bh, "");
if (uptodate)
set_buffer_uptodate(bh);
else
clear_buffer_uptodate(bh);
+ if (orig_bh) {
+ clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&orig_bh->b_state, BH_Shadow);
+ }
unlock_buffer(bh);
}
@@ -85,8 +92,7 @@ nope:
__brelse(bh);
}
-static void jbd2_commit_block_csum_set(journal_t *j,
- struct journal_head *descriptor)
+static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
struct commit_header *h;
__u32 csum;
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- h = (struct commit_header *)(jh2bh(descriptor)->b_data);
+ h = (struct commit_header *)(bh->b_data);
h->h_chksum_type = 0;
h->h_chksum_size = 0;
h->h_chksum[0] = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
h->h_chksum[0] = cpu_to_be32(csum);
}
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head **cbh,
__u32 crc32_sum)
{
- struct journal_head *descriptor;
struct commit_header *tmp;
struct buffer_head *bh;
int ret;
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
if (is_journal_aborted(journal))
return 0;
- descriptor = jbd2_journal_get_descriptor_buffer(journal);
- if (!descriptor)
+ bh = jbd2_journal_get_descriptor_buffer(journal);
+ if (!bh)
return 1;
- bh = jh2bh(descriptor);
-
tmp = (struct commit_header *)bh->b_data;
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
}
- jbd2_commit_block_csum_set(journal, descriptor);
+ jbd2_commit_block_csum_set(journal, bh);
- JBUFFER_TRACE(descriptor, "submit commit block");
+ BUFFER_TRACE(bh, "submit commit block");
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
if (unlikely(!buffer_uptodate(bh)))
ret = -EIO;
put_bh(bh); /* One for getblk() */
- jbd2_journal_put_journal_head(bh2jh(bh));
return ret;
}
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
}
static void jbd2_descr_block_csum_set(journal_t *j,
- struct journal_head *descriptor)
+ struct buffer_head *bh)
{
struct jbd2_journal_block_tail *tail;
__u32 csum;
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- tail = (struct jbd2_journal_block_tail *)
- (jh2bh(descriptor)->b_data + j->j_blocksize -
+ tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_block_tail));
tail->t_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->t_checksum = cpu_to_be32(csum);
}
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
{
struct page *page = bh->b_page;
__u8 *addr;
- __u32 csum;
+ __u32 csum32;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
sequence = cpu_to_be32(sequence);
addr = kmap_atomic(page);
- csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
- csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
- bh->b_size);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
+ bh->b_size);
kunmap_atomic(addr);
- tag->t_checksum = cpu_to_be32(csum);
+ /* We only have space to store the lower 16 bits of the crc32c. */
+ tag->t_checksum = cpu_to_be16(csum32);
}
/*
* jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
{
struct transaction_stats_s stats;
transaction_t *commit_transaction;
- struct journal_head *jh, *new_jh, *descriptor;
+ struct journal_head *jh;
+ struct buffer_head *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
int flags;
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tid_t first_tid;
int update_tail;
int csum_size = 0;
+ LIST_HEAD(io_bufs);
+ LIST_HEAD(log_bufs);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(journal->j_committing_transaction == NULL);
commit_transaction = journal->j_running_transaction;
- J_ASSERT(commit_transaction->t_state == T_RUNNING);
trace_jbd2_start_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: starting commit of transaction %d\n",
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_RUNNING);
commit_transaction->t_state = T_LOCKED;
trace_jbd2_commit_locking(journal, commit_transaction);
@@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_switch_revoke_table(journal);
+ /*
+ * Reserved credits cannot be claimed anymore, free them
+ */
+ atomic_sub(atomic_read(&journal->j_reserved_credits),
+ &commit_transaction->t_outstanding_credits);
+
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
wake_up(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2a\n");
/*
* Now start flushing things to disk, in the order they appear
@@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(journal, commit_transaction,
- WRITE_SYNC);
+ &log_bufs, WRITE_SYNC);
blk_finish_plug(&plug);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2b\n");
/*
* Way to go: we have now written out all of the data for a
@@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
atomic_read(&commit_transaction->t_outstanding_credits));
err = 0;
- descriptor = NULL;
bufs = 0;
+ descriptor = NULL;
blk_start_plug(&plug);
while (commit_transaction->t_buffers) {
@@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
record the metadata buffer. */
if (!descriptor) {
- struct buffer_head *bh;
-
J_ASSERT (bufs == 0);
jbd_debug(4, "JBD2: get descriptor\n");
@@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
continue;
}
- bh = jh2bh(descriptor);
jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
- (unsigned long long)bh->b_blocknr, bh->b_data);
- header = (journal_header_t *)&bh->b_data[0];
+ (unsigned long long)descriptor->b_blocknr,
+ descriptor->b_data);
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
- tagp = &bh->b_data[sizeof(journal_header_t)];
- space_left = bh->b_size - sizeof(journal_header_t);
+ tagp = &descriptor->b_data[sizeof(journal_header_t)];
+ space_left = descriptor->b_size -
+ sizeof(journal_header_t);
first_tag = 1;
- set_buffer_jwrite(bh);
- set_buffer_dirty(bh);
- wbuf[bufs++] = bh;
+ set_buffer_jwrite(descriptor);
+ set_buffer_dirty(descriptor);
+ wbuf[bufs++] = descriptor;
/* Record it so that we can wait for IO
completion later */
- BUFFER_TRACE(bh, "ph3: file as descriptor");
- jbd2_journal_file_buffer(descriptor, commit_transaction,
- BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "ph3: file as descriptor");
+ jbd2_file_log_bh(&log_bufs, descriptor);
}
/* Where is the buffer to be written? */
@@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Bump b_count to prevent truncate from stumbling over
the shadowed buffer! @@@ This can go if we ever get
- rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+ rid of the shadow pairing of buffers. */
atomic_inc(&jh2bh(jh)->b_count);
- /* Make a temporary IO buffer with which to write it out
- (this will requeue both the metadata buffer and the
- temporary IO buffer). new_bh goes on BJ_IO*/
-
- set_bit(BH_JWrite, &jh2bh(jh)->b_state);
/*
- * akpm: jbd2_journal_write_metadata_buffer() sets
- * new_bh->b_transaction to commit_transaction.
- * We need to clean this up before we release new_bh
- * (which is of type BJ_IO)
+ * Make a temporary IO buffer with which to write it out
+ * (this will requeue the metadata buffer to BJ_Shadow).
*/
+ set_bit(BH_JWrite, &jh2bh(jh)->b_state);
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
- jh, &new_jh, blocknr);
+ jh, &wbuf[bufs], blocknr);
if (flags < 0) {
jbd2_journal_abort(journal, flags);
continue;
}
- set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
- wbuf[bufs++] = jh2bh(new_jh);
+ jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
/* Record the new block's tag in the current descriptor
buffer */
@@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag = (journal_block_tag_t *) tagp;
write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be16(tag_flag);
- jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
+ jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
commit_transaction->t_tid);
tagp += tag_bytes;
space_left -= tag_bytes;
+ bufs++;
if (first_tag) {
memcpy (tagp, journal->j_uuid, 16);
@@ -809,7 +810,7 @@ start_journal_io:
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
transaction's t_log_list queue, and metadata buffers are on
- the t_iobuf_list queue.
+ the io_bufs list.
Wait for the buffers in reverse order. That way we are
less likely to be woken up until all IOs have completed, and
@@ -818,47 +819,33 @@ start_journal_io:
jbd_debug(3, "JBD2: commit phase 3\n");
- /*
- * akpm: these are BJ_IO, and j_list_lock is not needed.
- * See __journal_try_to_free_buffer.
- */
-wait_for_iobuf:
- while (commit_transaction->t_iobuf_list != NULL) {
- struct buffer_head *bh;
+ while (!list_empty(&io_bufs)) {
+ struct buffer_head *bh = list_entry(io_bufs.prev,
+ struct buffer_head,
+ b_assoc_buffers);
- jh = commit_transaction->t_iobuf_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_iobuf;
- }
- if (cond_resched())
- goto wait_for_iobuf;
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
-
- clear_buffer_jwrite(bh);
-
- JBUFFER_TRACE(jh, "ph4: unfile after journal write");
- jbd2_journal_unfile_buffer(journal, jh);
+ jbd2_unfile_log_bh(bh);
/*
- * ->t_iobuf_list should contain only dummy buffer_heads
- * which were created by jbd2_journal_write_metadata_buffer().
+ * The list contains temporary buffer heads created by
+ * jbd2_journal_write_metadata_buffer().
*/
BUFFER_TRACE(bh, "dumping temporary bh");
- jbd2_journal_put_journal_head(jh);
__brelse(bh);
J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
free_buffer_head(bh);
- /* We also have to unlock and free the corresponding
- shadowed buffer */
+ /* We also have to refile the corresponding shadowed buffer */
jh = commit_transaction->t_shadow_list->b_tprev;
bh = jh2bh(jh);
- clear_bit(BH_JWrite, &bh->b_state);
+ clear_buffer_jwrite(bh);
J_ASSERT_BH(bh, buffer_jbddirty(bh));
+ J_ASSERT_BH(bh, !buffer_shadow(bh));
/* The metadata is now released for reuse, but we need
to remember it against this transaction so that when
@@ -866,14 +853,6 @@ wait_for_iobuf:
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /*
- * Wake up any transactions which were waiting for this IO to
- * complete. The barrier must be here so that changes by
- * jbd2_journal_file_buffer() take effect before wake_up_bit()
- * does the waitqueue check.
- */
- smp_mb();
- wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
}
@@ -883,26 +862,19 @@ wait_for_iobuf:
jbd_debug(3, "JBD2: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
- while (commit_transaction->t_log_list != NULL) {
+ while (!list_empty(&log_bufs)) {
struct buffer_head *bh;
- jh = commit_transaction->t_log_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_ctlbuf;
- }
- if (cond_resched())
- goto wait_for_ctlbuf;
+ bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
- jbd2_journal_unfile_buffer(journal, jh);
- jbd2_journal_put_journal_head(jh);
+ jbd2_unfile_log_bh(bh);
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
@@ -952,9 +924,7 @@ wait_for_iobuf:
J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
- J_ASSERT(commit_transaction->t_iobuf_list == NULL);
J_ASSERT(commit_transaction->t_shadow_list == NULL);
- J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 95457576e434..02c7ad9d7a41 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache);
static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
+#ifdef CONFIG_JBD2_DEBUG
+void __jbd2_debug(int level, const char *file, const char *func,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (level > jbd2_journal_enable_debug)
+ return;
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+ va_end(args);
+}
+EXPORT_SYMBOL(__jbd2_debug);
+#endif
+
/* Checksumming functions */
int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
@@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal)
*
* If the source buffer has already been modified by a new transaction
* since we took the last commit snapshot, we use the frozen copy of
- * that data for IO. If we end up using the existing buffer_head's data
- * for the write, then we *have* to lock the buffer to prevent anyone
- * else from using and possibly modifying it while the IO is in
- * progress.
+ * that data for IO. If we end up using the existing buffer_head's data
+ * for the write, then we have to make sure nobody modifies it while the
+ * IO is in progress. do_get_write_access() handles this.
*
- * The function returns a pointer to the buffer_heads to be used for IO.
- *
- * We assume that the journal has already been locked in this function.
+ * The function returns a pointer to the buffer_head to be used for IO.
+ *
*
* Return value:
* <0: Error
@@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal)
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
- struct journal_head **jh_out,
- unsigned long long blocknr)
+ struct buffer_head **bh_out,
+ sector_t blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
- struct journal_head *new_jh;
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
@@ -368,14 +383,13 @@ retry_alloc:
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
- new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
+ jbd_lock_bh_state(bh_in);
+repeat:
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
*/
- jbd_lock_bh_state(bh_in);
-repeat:
if (jh_in->b_frozen_data) {
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
@@ -415,7 +429,7 @@ repeat:
jbd_unlock_bh_state(bh_in);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
- jbd2_journal_put_journal_head(new_jh);
+ brelse(new_bh);
return -ENOMEM;
}
jbd_lock_bh_state(bh_in);
@@ -426,7 +440,7 @@ repeat:
jh_in->b_frozen_data = tmp;
mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+ memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
kunmap_atomic(mapped_data);
new_page = virt_to_page(tmp);
@@ -452,14 +466,14 @@ repeat:
}
set_bh_page(new_bh, new_page, new_offset);
- new_jh->b_transaction = NULL;
- new_bh->b_size = jh2bh(jh_in)->b_size;
- new_bh->b_bdev = transaction->t_journal->j_dev;
+ new_bh->b_size = bh_in->b_size;
+ new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
+ new_bh->b_private = bh_in;
set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh);
- *jh_out = new_jh;
+ *bh_out = new_bh;
/*
* The to-be-written buffer needs to get moved to the io queue,
@@ -470,11 +484,9 @@ repeat:
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
+ set_buffer_shadow(bh_in);
jbd_unlock_bh_state(bh_in);
- JBUFFER_TRACE(new_jh, "file as BJ_IO");
- jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
-
return do_escape | (done_copy_out << 1);
}
@@ -484,35 +496,6 @@ repeat:
*/
/*
- * __jbd2_log_space_left: Return the number of free blocks left in the journal.
- *
- * Called with the journal already locked.
- *
- * Called under j_state_lock
- */
-
-int __jbd2_log_space_left(journal_t *journal)
-{
- int left = journal->j_free;
-
- /* assert_spin_locked(&journal->j_state_lock); */
-
- /*
- * Be pessimistic here about the number of those free blocks which
- * might be required for log descriptor control blocks.
- */
-
-#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-
- left -= MIN_LOG_RESERVED_BLOCKS;
-
- if (left <= 0)
- return 0;
- left -= (left >> 3);
- return left;
-}
-
-/*
* Called with j_state_lock locked for writing.
* Returns true if a transaction commit was started.
*/
@@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
}
/*
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
- *
- * We can only force the running transaction if we don't have an active handle;
- * otherwise, we will deadlock.
- *
- * Returns true if a transaction was started.
+ * Force and wait any uncommitted transactions. We can only force the running
+ * transaction if we don't have an active handle, otherwise, we will deadlock.
+ * Returns: <0 in case of error,
+ * 0 if nothing to commit,
+ * 1 if transaction was successfully committed.
*/
-int jbd2_journal_force_commit_nested(journal_t *journal)
+static int __jbd2_journal_force_commit(journal_t *journal)
{
transaction_t *transaction = NULL;
tid_t tid;
- int need_to_start = 0;
+ int need_to_start = 0, ret = 0;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction && !current->journal_info) {
@@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
transaction = journal->j_committing_transaction;
if (!transaction) {
+ /* Nothing to commit */
read_unlock(&journal->j_state_lock);
- return 0; /* Nothing to retry */
+ return 0;
}
-
tid = transaction->t_tid;
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
- jbd2_log_wait_commit(journal, tid);
- return 1;
+ ret = jbd2_log_wait_commit(journal, tid);
+ if (!ret)
+ ret = 1;
+
+ return ret;
+}
+
+/**
+ * Force and wait upon a commit if the calling process is not within
+ * transaction. This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * @journal: journal to force
+ * Returns true if progress was made.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+ int ret;
+
+ ret = __jbd2_journal_force_commit(journal);
+ return ret > 0;
+}
+
+/**
+ * int journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * Caller want unconditional commit. We can only force the running transaction
+ * if we don't have an active handle, otherwise, we will deadlock.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+ int ret;
+
+ J_ASSERT(!current->journal_info);
+ ret = __jbd2_journal_force_commit(journal);
+ if (ret > 0)
+ ret = 0;
+ return ret;
}
/*
@@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
-struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
unsigned long long blocknr;
@@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
set_buffer_uptodate(bh);
unlock_buffer(bh);
BUFFER_TRACE(bh, "return this buffer");
- return jbd2_journal_add_journal_head(bh);
+ return bh;
}
/*
@@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void)
return NULL;
init_waitqueue_head(&journal->j_wait_transaction_locked);
- init_waitqueue_head(&journal->j_wait_logspace);
init_waitqueue_head(&journal->j_wait_done_commit);
- init_waitqueue_head(&journal->j_wait_checkpoint);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
+ init_waitqueue_head(&journal->j_wait_reserved);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void)
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
+ atomic_set(&journal->j_reserved_credits, 0);
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;
@@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal)
static void jbd2_write_superblock(journal_t *journal, int write_op)
{
struct buffer_head *bh = journal->j_sb_buffer;
+ journal_superblock_t *sb = journal->j_superblock;
int ret;
trace_jbd2_write_superblock(journal, write_op);
@@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
+ jbd2_superblock_csum_set(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(write_op, bh);
@@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
journal->j_errno);
sb->s_errno = cpu_to_be32(journal->j_errno);
- jbd2_superblock_csum_set(journal, sb);
read_unlock(&journal->j_state_lock);
jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void)
#ifdef CONFIG_JBD2_DEBUG
atomic_inc(&nr_journal_heads);
#endif
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
if (!ret) {
jbd_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
while (!ret) {
yield();
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
}
}
return ret;
@@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
struct journal_head *new_jh = NULL;
repeat:
- if (!buffer_jbd(bh)) {
+ if (!buffer_jbd(bh))
new_jh = journal_alloc_journal_head();
- memset(new_jh, 0, sizeof(*new_jh));
- }
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846bac32f..d4851464b57e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
void *buf, __u32 sequence)
{
- __u32 provided, calculated;
+ __u32 csum32;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
sequence = cpu_to_be32(sequence);
- calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
- calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
- provided = be32_to_cpu(tag->t_checksum);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- return provided == cpu_to_be32(calculated);
+ return tag->t_checksum == cpu_to_be16(csum32);
}
static int do_one_pass(journal_t *journal,
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b4ce8b..198c9c10276d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
#ifdef __KERNEL__
static void write_one_revoke_record(journal_t *, transaction_t *,
- struct journal_head **, int *,
+ struct list_head *,
+ struct buffer_head **, int *,
struct jbd2_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct journal_head *, int, int);
+static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
#endif
/* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
*/
void jbd2_journal_write_revoke_records(journal_t *journal,
transaction_t *transaction,
+ struct list_head *log_bufs,
int write_op)
{
- struct journal_head *descriptor;
+ struct buffer_head *descriptor;
struct jbd2_revoke_record_s *record;
struct jbd2_revoke_table_s *revoke;
struct list_head *hash_list;
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
while (!list_empty(hash_list)) {
record = (struct jbd2_revoke_record_s *)
hash_list->next;
- write_one_revoke_record(journal, transaction,
+ write_one_revoke_record(journal, transaction, log_bufs,
&descriptor, &offset,
record, write_op);
count++;
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
static void write_one_revoke_record(journal_t *journal,
transaction_t *transaction,
- struct journal_head **descriptorp,
+ struct list_head *log_bufs,
+ struct buffer_head **descriptorp,
int *offsetp,
struct jbd2_revoke_record_s *record,
int write_op)
{
int csum_size = 0;
- struct journal_head *descriptor;
+ struct buffer_head *descriptor;
int offset;
journal_header_t *header;
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
descriptor = jbd2_journal_get_descriptor_buffer(journal);
if (!descriptor)
return;
- header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
- JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
- jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "file in log_bufs");
+ jbd2_file_log_bh(log_bufs, descriptor);
offset = sizeof(jbd2_journal_revoke_header_t);
*descriptorp = descriptor;
}
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
- * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
offset += 8;
} else {
- * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be32 *)(&descriptor->b_data[offset])) =
cpu_to_be32(record->blocknr);
offset += 4;
}
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal,
*offsetp = offset;
}
-static void jbd2_revoke_csum_set(journal_t *j,
- struct journal_head *descriptor)
+static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
{
struct jbd2_journal_revoke_tail *tail;
__u32 csum;
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- tail = (struct jbd2_journal_revoke_tail *)
- (jh2bh(descriptor)->b_data + j->j_blocksize -
+ tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_revoke_tail));
tail->r_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->r_checksum = cpu_to_be32(csum);
}
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j,
*/
static void flush_descriptor(journal_t *journal,
- struct journal_head *descriptor,
+ struct buffer_head *descriptor,
int offset, int write_op)
{
jbd2_journal_revoke_header_t *header;
- struct buffer_head *bh = jh2bh(descriptor);
if (is_journal_aborted(journal)) {
- put_bh(bh);
+ put_bh(descriptor);
return;
}
- header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+ header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
header->r_count = cpu_to_be32(offset);
jbd2_revoke_csum_set(journal, descriptor);
- set_buffer_jwrite(bh);
- BUFFER_TRACE(bh, "write");
- set_buffer_dirty(bh);
- write_dirty_buffer(bh, write_op);
+ set_buffer_jwrite(descriptor);
+ BUFFER_TRACE(descriptor, "write");
+ set_buffer_dirty(descriptor);
+ write_dirty_buffer(descriptor, write_op);
}
#endif
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c59ea8..7aa9a32573bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
- atomic_set(&transaction->t_outstanding_credits, 0);
+ atomic_set(&transaction->t_outstanding_credits,
+ atomic_read(&journal->j_reserved_credits));
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
@@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
}
/*
+ * Wait until running transaction passes T_LOCKED state. Also starts the commit
+ * if needed. The function expects running transaction to exist and releases
+ * j_state_lock.
+ */
+static void wait_transaction_locked(journal_t *journal)
+ __releases(journal->j_state_lock)
+{
+ DEFINE_WAIT(wait);
+ int need_to_start;
+ tid_t tid = journal->j_running_transaction->t_tid;
+
+ prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ TASK_UNINTERRUPTIBLE);
+ need_to_start = !tid_geq(journal->j_commit_request, tid);
+ read_unlock(&journal->j_state_lock);
+ if (need_to_start)
+ jbd2_log_start_commit(journal, tid);
+ schedule();
+ finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
+static void sub_reserved_credits(journal_t *journal, int blocks)
+{
+ atomic_sub(blocks, &journal->j_reserved_credits);
+ wake_up(&journal->j_wait_reserved);
+}
+
+/*
+ * Wait until we can add credits for handle to the running transaction. Called
+ * with j_state_lock held for reading. Returns 0 if handle joined the running
+ * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
+ * caller must retry.
+ */
+static int add_transaction_credits(journal_t *journal, int blocks,
+ int rsv_blocks)
+{
+ transaction_t *t = journal->j_running_transaction;
+ int needed;
+ int total = blocks + rsv_blocks;
+
+ /*
+ * If the current transaction is locked down for commit, wait
+ * for the lock to be released.
+ */
+ if (t->t_state == T_LOCKED) {
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * If there is not enough space left in the log to write all
+ * potential buffers requested by this operation, we need to
+ * stall pending a log checkpoint to free some more log space.
+ */
+ needed = atomic_add_return(total, &t->t_outstanding_credits);
+ if (needed > journal->j_max_transaction_buffers) {
+ /*
+ * If the current transaction is already too large,
+ * then start to commit it: we can then go back and
+ * attach this handle to a new transaction.
+ */
+ atomic_sub(total, &t->t_outstanding_credits);
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * The commit code assumes that it can get enough log space
+ * without forcing a checkpoint. This is *critical* for
+ * correctness: a checkpoint of a buffer which is also
+ * associated with a committing transaction creates a deadlock,
+ * so commit simply cannot force through checkpoints.
+ *
+ * We must therefore ensure the necessary space in the journal
+ * *before* starting to dirty potentially checkpointed buffers
+ * in the new transaction.
+ */
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ write_lock(&journal->j_state_lock);
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+ __jbd2_log_wait_for_space(journal);
+ write_unlock(&journal->j_state_lock);
+ return 1;
+ }
+
+ /* No reservation? We are done... */
+ if (!rsv_blocks)
+ return 0;
+
+ needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
+ /* We allow at most half of a transaction to be reserved */
+ if (needed > journal->j_max_transaction_buffers / 2) {
+ sub_reserved_credits(journal, rsv_blocks);
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) + rsv_blocks
+ <= journal->j_max_transaction_buffers / 2);
+ return 1;
+ }
+ return 0;
+}
+
+/*
* start_this_handle: Given a handle, deal with any locking or stalling
* needed to make sure that there is enough journal space for the handle
* to begin. Attach the handle to a transaction and set up the
@@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
- tid_t tid;
- int needed, need_to_start;
- int nblocks = handle->h_buffer_credits;
+ int blocks = handle->h_buffer_credits;
+ int rsv_blocks = 0;
unsigned long ts = jiffies;
- if (nblocks > journal->j_max_transaction_buffers) {
+ /*
+ * 1/2 of transaction can be reserved so we can practically handle
+ * only 1/2 of maximum transaction size per operation
+ */
+ if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
- current->comm, nblocks,
- journal->j_max_transaction_buffers);
+ current->comm, blocks,
+ journal->j_max_transaction_buffers / 2);
return -ENOSPC;
}
+ if (handle->h_rsv_handle)
+ rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
alloc_transaction:
if (!journal->j_running_transaction) {
new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -199,8 +312,12 @@ repeat:
return -EROFS;
}
- /* Wait on the journal's transaction barrier if necessary */
- if (journal->j_barrier_count) {
+ /*
+ * Wait on the journal's transaction barrier if necessary. Specifically
+ * we allow reserved handles to proceed because otherwise commit could
+ * deadlock on page writeback not being able to complete.
+ */
+ if (!handle->h_reserved && journal->j_barrier_count) {
read_unlock(&journal->j_state_lock);
wait_event(journal->j_wait_transaction_locked,
journal->j_barrier_count == 0);
@@ -213,7 +330,7 @@ repeat:
goto alloc_transaction;
write_lock(&journal->j_state_lock);
if (!journal->j_running_transaction &&
- !journal->j_barrier_count) {
+ (handle->h_reserved || !journal->j_barrier_count)) {
jbd2_get_transaction(journal, new_transaction);
new_transaction = NULL;
}
@@ -223,85 +340,18 @@ repeat:
transaction = journal->j_running_transaction;
- /*
- * If the current transaction is locked down for commit, wait for the
- * lock to be released.
- */
- if (transaction->t_state == T_LOCKED) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&journal->j_wait_transaction_locked,
- &wait, TASK_UNINTERRUPTIBLE);
- read_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * If there is not enough space left in the log to write all potential
- * buffers requested by this operation, we need to stall pending a log
- * checkpoint to free some more log space.
- */
- needed = atomic_add_return(nblocks,
- &transaction->t_outstanding_credits);
-
- if (needed > journal->j_max_transaction_buffers) {
+ if (!handle->h_reserved) {
+ /* We may have dropped j_state_lock - restart in that case */
+ if (add_transaction_credits(journal, blocks, rsv_blocks))
+ goto repeat;
+ } else {
/*
- * If the current transaction is already too large, then start
- * to commit it: we can then go back and attach this handle to
- * a new transaction.
+ * We have handle reserved so we are allowed to join T_LOCKED
+ * transaction and we don't have to check for transaction size
+ * and journal space.
*/
- DEFINE_WAIT(wait);
-
- jbd_debug(2, "Handle %p starting new commit...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
- TASK_UNINTERRUPTIBLE);
- tid = transaction->t_tid;
- need_to_start = !tid_geq(journal->j_commit_request, tid);
- read_unlock(&journal->j_state_lock);
- if (need_to_start)
- jbd2_log_start_commit(journal, tid);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * The commit code assumes that it can get enough log space
- * without forcing a checkpoint. This is *critical* for
- * correctness: a checkpoint of a buffer which is also
- * associated with a committing transaction creates a deadlock,
- * so commit simply cannot force through checkpoints.
- *
- * We must therefore ensure the necessary space in the journal
- * *before* starting to dirty potentially checkpointed buffers
- * in the new transaction.
- *
- * The worst part is, any transaction currently committing can
- * reduce the free space arbitrarily. Be careful to account for
- * those buffers when checkpointing.
- */
-
- /*
- * @@@ AKPM: This seems rather over-defensive. We're giving commit
- * a _lot_ of headroom: 1/4 of the journal plus the size of
- * the committing transaction. Really, we only need to give it
- * committing_transaction->t_outstanding_credits plus "enough" for
- * the log control blocks.
- * Also, this test is inconsistent with the matching one in
- * jbd2_journal_extend().
- */
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
- jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- read_unlock(&journal->j_state_lock);
- write_lock(&journal->j_state_lock);
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
- __jbd2_log_wait_for_space(journal);
- write_unlock(&journal->j_state_lock);
- goto repeat;
+ sub_reserved_credits(journal, blocks);
+ handle->h_reserved = 0;
}
/* OK, account for the buffers that this operation expects to
@@ -309,15 +359,16 @@ repeat:
*/
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
- handle->h_requested_credits = nblocks;
+ handle->h_requested_credits = blocks;
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
- jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
- handle, nblocks,
+ jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+ handle, blocks,
atomic_read(&transaction->t_outstanding_credits),
- __jbd2_log_space_left(journal));
+ jbd2_log_space_left(journal));
read_unlock(&journal->j_state_lock);
+ current->journal_info = handle;
lock_map_acquire(&handle->h_lockdep_map);
jbd2_journal_free_transaction(new_transaction);
@@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks)
*
* We make sure that the transaction can guarantee at least nblocks of
* modified buffers in the log. We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * is stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. Reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
*
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
* on failure.
*/
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
- unsigned int type, unsigned int line_no)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+ gfp_t gfp_mask, unsigned int type,
+ unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
+ if (rsv_blocks) {
+ handle_t *rsv_handle;
- current->journal_info = handle;
+ rsv_handle = new_handle(rsv_blocks);
+ if (!rsv_handle) {
+ jbd2_free_handle(handle);
+ return ERR_PTR(-ENOMEM);
+ }
+ rsv_handle->h_reserved = 1;
+ rsv_handle->h_journal = journal;
+ handle->h_rsv_handle = rsv_handle;
+ }
err = start_this_handle(journal, handle, gfp_mask);
if (err < 0) {
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
jbd2_free_handle(handle);
- current->journal_info = NULL;
return ERR_PTR(err);
}
handle->h_type = type;
@@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start);
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0);
+ return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+ journal_t *journal = handle->h_journal;
+
+ WARN_ON(!handle->h_reserved);
+ sub_reserved_credits(journal, handle->h_buffer_credits);
+ jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
+ * @handle: handle to start
+ *
+ * Start handle that has been previously reserved with jbd2_journal_reserve().
+ * This attaches @handle to the running transaction (or creates one if there's
+ * not transaction running). Unlike jbd2_journal_start() this function cannot
+ * block on journal commit, checkpointing, or similar stuff. It can block on
+ * memory allocation or frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is freed in that case.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+ unsigned int line_no)
+{
+ journal_t *journal = handle->h_journal;
+ int ret = -EIO;
+
+ if (WARN_ON(!handle->h_reserved)) {
+ /* Someone passed in normal handle? Just stop it. */
+ jbd2_journal_stop(handle);
+ return ret;
+ }
+ /*
+ * Usefulness of mixing of reserved and unreserved handles is
+ * questionable. So far nobody seems to need it so just error out.
+ */
+ if (WARN_ON(current->journal_info)) {
+ jbd2_journal_free_reserved(handle);
+ return ret;
+ }
+
+ handle->h_journal = NULL;
+ /*
+ * GFP_NOFS is here because callers are likely from writeback or
+ * similarly constrained call sites
+ */
+ ret = start_this_handle(journal, handle, GFP_NOFS);
+ if (ret < 0)
+ jbd2_journal_free_reserved(handle);
+ handle->h_type = type;
+ handle->h_line_no = line_no;
+ return ret;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);
/**
* int jbd2_journal_extend() - extend buffer credits.
@@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
int jbd2_journal_extend(handle_t *handle, int nblocks)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
int result;
int wanted;
- result = -EIO;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
+ return -EROFS;
+ journal = transaction->t_journal;
result = 1;
read_lock(&journal->j_state_lock);
/* Don't extend a locked-down transaction! */
- if (handle->h_transaction->t_state != T_RUNNING) {
+ if (transaction->t_state != T_RUNNING) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction not running\n", handle, nblocks);
goto error_out;
}
spin_lock(&transaction->t_handle_lock);
- wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
+ wanted = atomic_add_return(nblocks,
+ &transaction->t_outstanding_credits);
if (wanted > journal->j_max_transaction_buffers) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
- if (wanted > __jbd2_log_space_left(journal)) {
+ if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
+ jbd2_log_space_left(journal)) {
jbd_debug(3, "denied handle %p %d blocks: "
"insufficient log space\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
- handle->h_transaction->t_tid,
+ transaction->t_tid,
handle->h_type, handle->h_line_no,
handle->h_buffer_credits,
nblocks);
handle->h_buffer_credits += nblocks;
handle->h_requested_credits += nblocks;
- atomic_add(nblocks, &transaction->t_outstanding_credits);
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -473,7 +599,6 @@ unlock:
spin_unlock(&transaction->t_handle_lock);
error_out:
read_unlock(&journal->j_state_lock);
-out:
return result;
}
@@ -490,19 +615,22 @@ out:
* to a running handle, a call to jbd2_journal_restart will commit the
* handle's transaction so far and reattach the handle to a new
* transaction capabable of guaranteeing the requested number of
- * credits.
+ * credits. We preserve reserved handle if there's any attached to the
+ * passed in handle.
*/
int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
tid_t tid;
int need_to_start, ret;
+ WARN_ON(!transaction);
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
+ journal = transaction->t_journal;
/*
* First unlink the handle from its current transaction, and start the
@@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
spin_lock(&transaction->t_handle_lock);
atomic_sub(handle->h_buffer_credits,
&transaction->t_outstanding_credits);
+ if (handle->h_rsv_handle) {
+ sub_reserved_credits(journal,
+ handle->h_rsv_handle->h_buffer_credits);
+ }
if (atomic_dec_and_test(&transaction->t_updates))
wake_up(&journal->j_wait_updates);
+ tid = transaction->t_tid;
spin_unlock(&transaction->t_handle_lock);
+ handle->h_transaction = NULL;
+ current->journal_info = NULL;
jbd_debug(2, "restarting handle %p\n", handle);
- tid = transaction->t_tid;
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
if (need_to_start)
@@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
write_lock(&journal->j_state_lock);
++journal->j_barrier_count;
+ /* Wait until there are no reserved handles */
+ if (atomic_read(&journal->j_reserved_credits)) {
+ write_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) == 0);
+ write_lock(&journal->j_state_lock);
+ }
+
/* Wait until there are no running updates */
while (1) {
transaction_t *transaction = journal->j_running_transaction;
@@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
+static int sleep_on_shadow_bh(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
/*
* If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior
@@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
int force_copy)
{
struct buffer_head *bh;
- transaction_t *transaction;
+ transaction_t *transaction = handle->h_transaction;
journal_t *journal;
int error;
char *frozen_buffer = NULL;
int need_copy = 0;
unsigned long start_lock, time_lock;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
-
- transaction = handle->h_transaction;
journal = transaction->t_journal;
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -754,41 +901,29 @@ repeat:
* journaled. If the primary copy is already going to
* disk then we cannot do copy-out here. */
- if (jh->b_jlist == BJ_Shadow) {
- DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
- wait_queue_head_t *wqh;
-
- wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+ if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
jbd_unlock_bh_state(bh);
- /* commit wakes up all shadow buffers after IO */
- for ( ; ; ) {
- prepare_to_wait(wqh, &wait.wait,
- TASK_UNINTERRUPTIBLE);
- if (jh->b_jlist != BJ_Shadow)
- break;
- schedule();
- }
- finish_wait(wqh, &wait.wait);
+ wait_on_bit(&bh->b_state, BH_Shadow,
+ sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
goto repeat;
}
- /* Only do the copy if the currently-owning transaction
- * still needs it. If it is on the Forget list, the
- * committing transaction is past that stage. The
- * buffer had better remain locked during the kmalloc,
- * but that should be true --- we hold the journal lock
- * still and the buffer is already on the BUF_JOURNAL
- * list so won't be flushed.
+ /*
+ * Only do the copy if the currently-owning transaction still
+ * needs it. If buffer isn't on BJ_Metadata list, the
+ * committing transaction is past that stage (here we use the
+ * fact that BH_Shadow is set under bh_state lock together with
+ * refiling to BJ_Shadow list and at this point we know the
+ * buffer doesn't have BH_Shadow set).
*
* Subtle point, though: if this is a get_undo_access,
* then we will be relying on the frozen_data to contain
* the new value of the committed_data record after the
* transaction, so we HAVE to force the frozen_data copy
- * in that case. */
-
- if (jh->b_jlist != BJ_Forget || force_copy) {
+ * in that case.
+ */
+ if (jh->b_jlist == BJ_Metadata || force_copy) {
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
@@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
int err;
jbd_debug(5, "journal_head %p\n", jh);
+ WARN_ON(!transaction);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
+ journal = transaction->t_journal;
err = 0;
JBUFFER_TRACE(jh, "entry");
@@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh;
int ret = 0;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
+ return -EROFS;
+ journal = transaction->t_journal;
jh = jbd2_journal_grab_journal_head(bh);
if (!jh) {
ret = -EUCLEAN;
@@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "file as BJ_Metadata");
spin_lock(&journal->j_list_lock);
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
jbd_unlock_bh_state(bh);
@@ -1258,12 +1397,17 @@ out:
int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh;
int drop_reserve = 0;
int err = 0;
int was_modified = 0;
+ WARN_ON(!transaction);
+ if (is_handle_aborted(handle))
+ return -EROFS;
+ journal = transaction->t_journal;
+
BUFFER_TRACE(bh, "entry");
jbd_lock_bh_state(bh);
@@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
*/
jh->b_modified = 0;
- if (jh->b_transaction == handle->h_transaction) {
+ if (jh->b_transaction == transaction) {
J_ASSERT_JH(jh, !jh->b_frozen_data);
/* If we are forgetting a buffer which is already part
@@ -1385,19 +1529,21 @@ drop:
int jbd2_journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- int err, wait_for_commit = 0;
+ journal_t *journal;
+ int err = 0, wait_for_commit = 0;
tid_t tid;
pid_t pid;
+ if (!transaction)
+ goto free_and_exit;
+ journal = transaction->t_journal;
+
J_ASSERT(journal_current_handle() == handle);
if (is_handle_aborted(handle))
err = -EIO;
- else {
+ else
J_ASSERT(atomic_read(&transaction->t_updates) > 0);
- err = 0;
- }
if (--handle->h_ref > 0) {
jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle)
jbd_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
- handle->h_transaction->t_tid,
+ transaction->t_tid,
handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
handle->h_sync, handle->h_requested_credits,
@@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle)
lock_map_release(&handle->h_lockdep_map);
+ if (handle->h_rsv_handle)
+ jbd2_journal_free_reserved(handle->h_rsv_handle);
+free_and_exit:
jbd2_free_handle(handle);
return err;
}
-/**
- * int jbd2_journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk. May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int jbd2_journal_force_commit(journal_t *journal)
-{
- handle_t *handle;
- int ret;
-
- handle = jbd2_journal_start(journal, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- } else {
- handle->h_sync = 1;
- ret = jbd2_journal_stop(handle);
- }
- return ret;
-}
-
/*
*
* List management code snippets: various functions for manipulating the
@@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
- * of these pointers, it could go bad. Generally the caller needs to re-read
- * the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
+ * t_reserved_list. If the caller is holding onto a copy of one of these
+ * pointers, it could go bad. Generally the caller needs to re-read the
+ * pointer from the transaction_t.
*
* Called under j_list_lock.
*/
@@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -2034,18 +2154,23 @@ zap_buffer_unlocked:
* void jbd2_journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
- * @offset: length of page to invalidate.
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
- * Reap page buffers containing data after offset in page. Can return -EBUSY
- * if buffers are part of the committing transaction and the page is straddling
- * i_size. Caller then has to wait for current commit and try again.
+ * Reap page buffers containing data after in the specified range in page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
*/
int jbd2_journal_invalidatepage(journal_t *journal,
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
int ret = 0;
@@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return 0;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
@@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return 0;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- ret = journal_unmap_buffer(journal, bh, offset > 0);
+ ret = journal_unmap_buffer(journal, bh, partial_page);
unlock_buffer(bh);
if (ret < 0)
return ret;
@@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
@@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- return -EIO;
+ return -EROFS;
+ journal = transaction->t_journal;
jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
transaction->t_tid);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index acd46a4160cb..e3aac222472e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -22,7 +22,7 @@
#include <linux/time.h>
#include "nodelist.h"
-static int jffs2_readdir (struct file *, void *, filldir_t);
+static int jffs2_readdir (struct file *, struct dir_context *);
static int jffs2_create (struct inode *,struct dentry *,umode_t,
bool);
@@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *,
const struct file_operations jffs2_dir_operations =
{
.read = generic_read_dir,
- .readdir = jffs2_readdir,
+ .iterate = jffs2_readdir,
.unlocked_ioctl=jffs2_ioctl,
.fsync = jffs2_fsync,
.llseek = generic_file_llseek,
@@ -114,60 +114,40 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
/***********************************************************************/
-static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int jffs2_readdir(struct file *file, struct dir_context *ctx)
{
- struct jffs2_inode_info *f;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
+ struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
struct jffs2_full_dirent *fd;
- unsigned long offset, curofs;
+ unsigned long curofs = 1;
- jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n",
- file_inode(filp)->i_ino);
+ jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino);
- f = JFFS2_INODE_INFO(inode);
-
- offset = filp->f_pos;
-
- if (offset == 0) {
- jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino);
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
- goto out;
- offset++;
- }
- if (offset == 1) {
- unsigned long pino = parent_ino(filp->f_path.dentry);
- jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino);
- if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0)
- goto out;
- offset++;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
- curofs=1;
mutex_lock(&f->sem);
for (fd = f->dents; fd; fd = fd->next) {
-
curofs++;
- /* First loop: curofs = 2; offset = 2 */
- if (curofs < offset) {
+ /* First loop: curofs = 2; pos = 2 */
+ if (curofs < ctx->pos) {
jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
- fd->name, fd->ino, fd->type, curofs, offset);
+ fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos);
continue;
}
if (!fd->ino) {
jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n",
fd->name);
- offset++;
+ ctx->pos++;
continue;
}
jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n",
- offset, fd->name, fd->ino, fd->type);
- if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0)
+ (unsigned long)ctx->pos, fd->name, fd->ino, fd->type);
+ if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type))
break;
- offset++;
+ ctx->pos++;
}
mutex_unlock(&f->sem);
- out:
- filp->f_pos = offset;
return 0;
}
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0ddbeceafc62..9f4ed13d9f15 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -3002,9 +3002,9 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
* return: offset = (pn, index) of start entry
* of next jfs_readdir()/dtRead()
*/
-int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int jfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *ip = file_inode(filp);
+ struct inode *ip = file_inode(file);
struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
int rc = 0;
loff_t dtpos; /* legacy OS/2 style position */
@@ -3033,7 +3033,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
int overflow, fix_page, page_fixed = 0;
static int unique_pos = 2; /* If we can't fix broken index */
- if (filp->f_pos == DIREND)
+ if (ctx->pos == DIREND)
return 0;
if (DO_INDEX(ip)) {
@@ -3045,7 +3045,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
*/
do_index = 1;
- dir_index = (u32) filp->f_pos;
+ dir_index = (u32) ctx->pos;
if (dir_index > 1) {
struct dir_table_slot dirtab_slot;
@@ -3053,25 +3053,25 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (dtEmpty(ip) ||
(dir_index >= JFS_IP(ip)->next_index)) {
/* Stale position. Directory has shrunk */
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
repeat:
rc = read_index(ip, dir_index, &dirtab_slot);
if (rc) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return rc;
}
if (dirtab_slot.flag == DIR_INDEX_FREE) {
if (loop_count++ > JFS_IP(ip)->next_index) {
jfs_err("jfs_readdir detected "
"infinite loop!");
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
dir_index = le32_to_cpu(dirtab_slot.addr2);
if (dir_index == -1) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
goto repeat;
@@ -3080,13 +3080,13 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
index = dirtab_slot.slot;
DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
if (rc) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
if (p->header.flag & BT_INTERNAL) {
jfs_err("jfs_readdir: bad index table");
DT_PUTPAGE(mp);
- filp->f_pos = -1;
+ ctx->pos = -1;
return 0;
}
} else {
@@ -3094,23 +3094,22 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/*
* self "."
*/
- filp->f_pos = 0;
- if (filldir(dirent, ".", 1, 0, ip->i_ino,
- DT_DIR))
+ ctx->pos = 0;
+ if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
return 0;
}
/*
* parent ".."
*/
- filp->f_pos = 1;
- if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
+ ctx->pos = 1;
+ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
return 0;
/*
* Find first entry of left-most leaf
*/
if (dtEmpty(ip)) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
@@ -3128,23 +3127,19 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
* pn > 0: Real entries, pn=1 -> leftmost page
* pn = index = -1: No more entries
*/
- dtpos = filp->f_pos;
+ dtpos = ctx->pos;
if (dtpos == 0) {
/* build "." entry */
-
- if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
- DT_DIR))
+ if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
return 0;
dtoffset->index = 1;
- filp->f_pos = dtpos;
+ ctx->pos = dtpos;
}
if (dtoffset->pn == 0) {
if (dtoffset->index == 1) {
/* build ".." entry */
-
- if (filldir(dirent, "..", 2, filp->f_pos,
- PARENT(ip), DT_DIR))
+ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
return 0;
} else {
jfs_err("jfs_readdir called with "
@@ -3152,18 +3147,18 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
dtoffset->pn = 1;
dtoffset->index = 0;
- filp->f_pos = dtpos;
+ ctx->pos = dtpos;
}
if (dtEmpty(ip)) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
- if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
+ if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
jfs_err("jfs_readdir: unexpected rc = %d "
"from dtReadNext", rc);
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
/* get start leaf page and index */
@@ -3171,7 +3166,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* offset beyond directory eof ? */
if (bn < 0) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
}
@@ -3180,7 +3175,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (dirent_buf == 0) {
DT_PUTPAGE(mp);
jfs_warn("jfs_readdir: __get_free_page failed!");
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return -ENOMEM;
}
@@ -3295,9 +3290,9 @@ skip_one:
jfs_dirent = (struct jfs_dirent *) dirent_buf;
while (jfs_dirents--) {
- filp->f_pos = jfs_dirent->position;
- if (filldir(dirent, jfs_dirent->name,
- jfs_dirent->name_len, filp->f_pos,
+ ctx->pos = jfs_dirent->position;
+ if (!dir_emit(ctx, jfs_dirent->name,
+ jfs_dirent->name_len,
jfs_dirent->ino, DT_UNKNOWN))
goto out;
jfs_dirent = next_jfs_dirent(jfs_dirent);
@@ -3309,7 +3304,7 @@ skip_one:
}
if (!overflow && (bn == 0)) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
break;
}
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 2545bb317235..fd4169e6e698 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -265,5 +265,5 @@ extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key,
extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
ino_t * orig_ino, ino_t new_ino, int flag);
-extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
+extern int jfs_readdir(struct file *file, struct dir_context *ctx);
#endif /* !_H_JFS_DTREE */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c57499dca89c..360d27c48887 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2009,7 +2009,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- submit_bio(READ_SYNC, bio);
+ /*check if journaling to disk has been disabled*/
+ if (log->no_integrity) {
+ bio->bi_size = 0;
+ lbmIODone(bio, 0);
+ } else {
+ submit_bio(READ_SYNC, bio);
+ }
wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82b..9e3aaff11f89 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void metapage_invalidatepage(struct page *page, unsigned long offset)
+static void metapage_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- BUG_ON(offset);
+ BUG_ON(offset || length < PAGE_CACHE_SIZE);
BUG_ON(PageWriteback(page));
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 3b91a7ad6086..89186b7b9002 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1529,7 +1529,7 @@ const struct inode_operations jfs_dir_inode_operations = {
const struct file_operations jfs_dir_operations = {
.read = generic_read_dir,
- .readdir = jfs_readdir,
+ .iterate = jfs_readdir,
.fsync = jfs_fsync,
.unlocked_ioctl = jfs_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2003e830ed1c..788e0a9c1fb0 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -611,11 +611,28 @@ static int jfs_freeze(struct super_block *sb)
{
struct jfs_sb_info *sbi = JFS_SBI(sb);
struct jfs_log *log = sbi->log;
+ int rc = 0;
if (!(sb->s_flags & MS_RDONLY)) {
txQuiesce(sb);
- lmLogShutdown(log);
- updateSuper(sb, FM_CLEAN);
+ rc = lmLogShutdown(log);
+ if (rc) {
+ jfs_error(sb, "jfs_freeze: lmLogShutdown failed");
+
+ /* let operations fail rather than hang */
+ txResume(sb);
+
+ return rc;
+ }
+ rc = updateSuper(sb, FM_CLEAN);
+ if (rc) {
+ jfs_err("jfs_freeze: updateSuper failed\n");
+ /*
+ * Don't fail here. Everything succeeded except
+ * marking the superblock clean, so there's really
+ * no harm in leaving it frozen for now.
+ */
+ }
}
return 0;
}
@@ -627,13 +644,18 @@ static int jfs_unfreeze(struct super_block *sb)
int rc = 0;
if (!(sb->s_flags & MS_RDONLY)) {
- updateSuper(sb, FM_MOUNT);
- if ((rc = lmLogInit(log)))
- jfs_err("jfs_unlock failed with return code %d", rc);
- else
- txResume(sb);
+ rc = updateSuper(sb, FM_MOUNT);
+ if (rc) {
+ jfs_error(sb, "jfs_unfreeze: updateSuper failed");
+ goto out;
+ }
+ rc = lmLogInit(log);
+ if (rc)
+ jfs_error(sb, "jfs_unfreeze: lmLogInit failed");
+out:
+ txResume(sb);
}
- return 0;
+ return rc;
}
static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
diff --git a/fs/libfs.c b/fs/libfs.c
index 916da8c4158b..c3a0837fb861 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -135,60 +135,40 @@ static inline unsigned char dt_type(struct inode *inode)
* both impossible due to the lock on directory.
*/
-int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int dcache_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct dentry *cursor = filp->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct dentry *cursor = file->private_data;
struct list_head *p, *q = &cursor->d_u.d_child;
- ino_t ino;
- int i = filp->f_pos;
- switch (i) {
- case 0:
- ino = dentry->d_inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- default:
- spin_lock(&dentry->d_lock);
- if (filp->f_pos == 2)
- list_move(q, &dentry->d_subdirs);
-
- for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
- struct dentry *next;
- next = list_entry(p, struct dentry, d_u.d_child);
- spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
- if (!simple_positive(next)) {
- spin_unlock(&next->d_lock);
- continue;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+ spin_lock(&dentry->d_lock);
+ if (ctx->pos == 2)
+ list_move(q, &dentry->d_subdirs);
+
+ for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
+ struct dentry *next = list_entry(p, struct dentry, d_u.d_child);
+ spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+ if (!simple_positive(next)) {
+ spin_unlock(&next->d_lock);
+ continue;
+ }
- spin_unlock(&next->d_lock);
- spin_unlock(&dentry->d_lock);
- if (filldir(dirent, next->d_name.name,
- next->d_name.len, filp->f_pos,
- next->d_inode->i_ino,
- dt_type(next->d_inode)) < 0)
- return 0;
- spin_lock(&dentry->d_lock);
- spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
- /* next is still alive */
- list_move(q, p);
- spin_unlock(&next->d_lock);
- p = q;
- filp->f_pos++;
- }
- spin_unlock(&dentry->d_lock);
+ spin_unlock(&next->d_lock);
+ spin_unlock(&dentry->d_lock);
+ if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+ next->d_inode->i_ino, dt_type(next->d_inode)))
+ return 0;
+ spin_lock(&dentry->d_lock);
+ spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+ /* next is still alive */
+ list_move(q, p);
+ spin_unlock(&next->d_lock);
+ p = q;
+ ctx->pos++;
}
+ spin_unlock(&dentry->d_lock);
return 0;
}
@@ -202,7 +182,7 @@ const struct file_operations simple_dir_operations = {
.release = dcache_dir_close,
.llseek = dcache_dir_lseek,
.read = generic_read_dir,
- .readdir = dcache_readdir,
+ .iterate = dcache_readdir,
.fsync = noop_fsync,
};
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b82751082112..6bdc347008f5 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -281,17 +281,23 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
/* FIXME: readdir currently has it's own dir_walk code. I don't see a good
* way to combine the two copies */
-#define IMPLICIT_NODES 2
-static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+static int logfs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *dir = file_inode(file);
- loff_t pos = file->f_pos - IMPLICIT_NODES;
+ loff_t pos;
struct page *page;
struct logfs_disk_dentry *dd;
- int full;
+ if (ctx->pos < 0)
+ return -EINVAL;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ pos = ctx->pos - 2;
BUG_ON(pos < 0);
- for (;; pos++) {
+ for (;; pos++, ctx->pos++) {
+ bool full;
if (beyond_eof(dir, pos))
break;
if (!logfs_exist_block(dir, pos)) {
@@ -306,42 +312,17 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
dd = kmap(page);
BUG_ON(dd->namelen == 0);
- full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
- pos, be64_to_cpu(dd->ino), dd->type);
+ full = !dir_emit(ctx, (char *)dd->name,
+ be16_to_cpu(dd->namelen),
+ be64_to_cpu(dd->ino), dd->type);
kunmap(page);
page_cache_release(page);
if (full)
break;
}
-
- file->f_pos = pos + IMPLICIT_NODES;
return 0;
}
-static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
-{
- struct inode *inode = file_inode(file);
- ino_t pino = parent_ino(file->f_dentry);
- int err;
-
- if (file->f_pos < 0)
- return -EINVAL;
-
- if (file->f_pos == 0) {
- if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
- return 0;
- file->f_pos++;
- }
- if (file->f_pos == 1) {
- if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
- return 0;
- file->f_pos++;
- }
-
- err = __logfs_readdir(file, buf, filldir);
- return err;
-}
-
static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
{
dd->namelen = cpu_to_be16(name->len);
@@ -814,7 +795,7 @@ const struct inode_operations logfs_dir_iops = {
const struct file_operations logfs_dir_fops = {
.fsync = logfs_fsync,
.unlocked_ioctl = logfs_ioctl,
- .readdir = logfs_readdir,
+ .iterate = logfs_readdir,
.read = generic_read_dir,
.llseek = default_llseek,
};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2219a6dd3c8..57914fc32b62 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
return __logfs_writepage(page);
}
-static void logfs_invalidatepage(struct page *page, unsigned long offset)
+static void logfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct logfs_block *block = logfs_block(page);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da0991794..d448a777166b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb)
return area;
}
-static void map_invalidatepage(struct page *page, unsigned long l)
+static void map_invalidatepage(struct page *page, unsigned int o,
+ unsigned int l)
{
return;
}
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index a9ed6f36e6ea..08c442902fcd 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -16,12 +16,12 @@
typedef struct minix_dir_entry minix_dirent;
typedef struct minix3_dir_entry minix3_dirent;
-static int minix_readdir(struct file *, void *, filldir_t);
+static int minix_readdir(struct file *, struct dir_context *);
const struct file_operations minix_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = minix_readdir,
+ .iterate = minix_readdir,
.fsync = generic_file_fsync,
};
@@ -82,22 +82,23 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
return (void*)((char*)de + sbi->s_dirsize);
}
-static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int minix_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned long pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
- unsigned long npages = dir_pages(inode);
struct minix_sb_info *sbi = minix_sb(sb);
unsigned chunk_size = sbi->s_dirsize;
- char *name;
- __u32 inumber;
+ unsigned long npages = dir_pages(inode);
+ unsigned long pos = ctx->pos;
+ unsigned offset;
+ unsigned long n;
- pos = (pos + chunk_size-1) & ~(chunk_size-1);
+ ctx->pos = pos = (pos + chunk_size-1) & ~(chunk_size-1);
if (pos >= inode->i_size)
- goto done;
+ return 0;
+
+ offset = pos & ~PAGE_CACHE_MASK;
+ n = pos >> PAGE_CACHE_SHIFT;
for ( ; n < npages; n++, offset = 0) {
char *p, *kaddr, *limit;
@@ -109,6 +110,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
p = kaddr+offset;
limit = kaddr + minix_last_byte(inode, n) - chunk_size;
for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
+ const char *name;
+ __u32 inumber;
if (sbi->s_version == MINIX_V3) {
minix3_dirent *de3 = (minix3_dirent *)p;
name = de3->name;
@@ -119,24 +122,17 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
inumber = de->inode;
}
if (inumber) {
- int over;
-
unsigned l = strnlen(name, sbi->s_namelen);
- offset = p - kaddr;
- over = filldir(dirent, name, l,
- (n << PAGE_CACHE_SHIFT) | offset,
- inumber, DT_UNKNOWN);
- if (over) {
+ if (!dir_emit(ctx, name, l,
+ inumber, DT_UNKNOWN)) {
dir_put_page(page);
- goto done;
+ return 0;
}
}
+ ctx->pos += chunk_size;
}
dir_put_page(page);
}
-
-done:
- filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
return 0;
}
diff --git a/fs/namei.c b/fs/namei.c
index 85e40d1c0a8f..9ed9361223c0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1976,7 +1976,7 @@ static int path_lookupat(int dfd, const char *name,
err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY) {
- if (!nd->inode->i_op->lookup) {
+ if (!can_lookup(nd->inode)) {
path_put(&nd->path);
err = -ENOTDIR;
}
@@ -2850,7 +2850,7 @@ finish_lookup:
if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
goto out;
error = -ENOTDIR;
- if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
+ if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
goto out;
audit_inode(name, nd->path.dentry, 0);
finish_open:
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 816326093656..0e7f00298213 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -23,12 +23,12 @@
#include "ncp_fs.h"
-static void ncp_read_volume_list(struct file *, void *, filldir_t,
+static void ncp_read_volume_list(struct file *, struct dir_context *,
struct ncp_cache_control *);
-static void ncp_do_readdir(struct file *, void *, filldir_t,
+static void ncp_do_readdir(struct file *, struct dir_context *,
struct ncp_cache_control *);
-static int ncp_readdir(struct file *, void *, filldir_t);
+static int ncp_readdir(struct file *, struct dir_context *);
static int ncp_create(struct inode *, struct dentry *, umode_t, bool);
static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int);
@@ -49,7 +49,7 @@ const struct file_operations ncp_dir_operations =
{
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ncp_readdir,
+ .iterate = ncp_readdir,
.unlocked_ioctl = ncp_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ncp_compat_ioctl,
@@ -424,9 +424,9 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
}
-static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ncp_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct page *page = NULL;
struct ncp_server *server = NCP_SERVER(inode);
@@ -440,7 +440,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (int) filp->f_pos);
+ (int) ctx->pos);
result = -EIO;
/* Do not generate '.' and '..' when server is dead. */
@@ -448,16 +448,8 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto out;
result = 0;
- if (filp->f_pos == 0) {
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
- goto out;
- filp->f_pos = 1;
- }
- if (filp->f_pos == 1) {
- if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR))
- goto out;
- filp->f_pos = 2;
- }
+ if (!dir_emit_dots(file, ctx))
+ goto out;
page = grab_cache_page(&inode->i_data, 0);
if (!page)
@@ -469,7 +461,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (!PageUptodate(page) || !ctl.head.eof)
goto init_cache;
- if (filp->f_pos == 2) {
+ if (ctx->pos == 2) {
if (jiffies - ctl.head.time >= NCP_MAX_AGE(server))
goto init_cache;
@@ -479,10 +471,10 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto init_cache;
}
- if (filp->f_pos > ctl.head.end)
+ if (ctx->pos > ctl.head.end)
goto finished;
- ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2);
+ ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2);
ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE;
ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE;
@@ -497,21 +489,21 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
while (ctl.idx < NCP_DIRCACHE_SIZE) {
struct dentry *dent;
- int res;
+ bool over;
dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx],
- dentry, filp->f_pos);
+ dentry, ctx->pos);
if (!dent)
goto invalid_cache;
- res = filldir(dirent, dent->d_name.name,
- dent->d_name.len, filp->f_pos,
+ over = !dir_emit(ctx, dent->d_name.name,
+ dent->d_name.len,
dent->d_inode->i_ino, DT_UNKNOWN);
dput(dent);
- if (res)
+ if (over)
goto finished;
- filp->f_pos += 1;
+ ctx->pos += 1;
ctl.idx += 1;
- if (filp->f_pos > ctl.head.end)
+ if (ctx->pos > ctl.head.end)
goto finished;
}
if (ctl.page) {
@@ -548,9 +540,9 @@ init_cache:
ctl.valid = 1;
read_really:
if (ncp_is_server_root(inode)) {
- ncp_read_volume_list(filp, dirent, filldir, &ctl);
+ ncp_read_volume_list(file, ctx, &ctl);
} else {
- ncp_do_readdir(filp, dirent, filldir, &ctl);
+ ncp_do_readdir(file, ctx, &ctl);
}
ctl.head.end = ctl.fpos - 1;
ctl.head.eof = ctl.valid;
@@ -573,11 +565,11 @@ out:
}
static int
-ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+ncp_fill_cache(struct file *file, struct dir_context *ctx,
struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
int inval_childs)
{
- struct dentry *newdent, *dentry = filp->f_path.dentry;
+ struct dentry *newdent, *dentry = file->f_path.dentry;
struct inode *dir = dentry->d_inode;
struct ncp_cache_control ctl = *ctrl;
struct qstr qname;
@@ -666,15 +658,15 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
end_advance:
if (!valid)
ctl.valid = 0;
- if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
+ if (!ctl.filled && (ctl.fpos == ctx->pos)) {
if (!ino)
ino = find_inode_number(dentry, &qname);
if (!ino)
ino = iunique(dir->i_sb, 2);
- ctl.filled = filldir(dirent, qname.name, qname.len,
- filp->f_pos, ino, DT_UNKNOWN);
+ ctl.filled = !dir_emit(ctx, qname.name, qname.len,
+ ino, DT_UNKNOWN);
if (!ctl.filled)
- filp->f_pos += 1;
+ ctx->pos += 1;
}
ctl.fpos += 1;
ctl.idx += 1;
@@ -683,10 +675,10 @@ end_advance:
}
static void
-ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
+ncp_read_volume_list(struct file *file, struct dir_context *ctx,
struct ncp_cache_control *ctl)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct ncp_server *server = NCP_SERVER(inode);
struct ncp_volume_info info;
@@ -694,7 +686,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
int i;
DPRINTK("ncp_read_volume_list: pos=%ld\n",
- (unsigned long) filp->f_pos);
+ (unsigned long) ctx->pos);
for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
int inval_dentry;
@@ -715,16 +707,16 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
}
inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
entry.volume = entry.i.volNumber;
- if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
+ if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry))
return;
}
}
static void
-ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
+ncp_do_readdir(struct file *file, struct dir_context *ctx,
struct ncp_cache_control *ctl)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *dir = dentry->d_inode;
struct ncp_server *server = NCP_SERVER(dir);
struct nw_search_sequence seq;
@@ -736,7 +728,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (unsigned long) filp->f_pos);
+ (unsigned long) ctx->pos);
PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n",
dentry->d_name.name, NCP_FINFO(dir)->volNumber,
NCP_FINFO(dir)->dirEntNum);
@@ -778,7 +770,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
rpl += onerpl;
rpls -= onerpl;
entry.volume = entry.i.volNumber;
- if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
+ if (!ncp_fill_cache(file, ctx, ctl, &entry, 0))
break;
}
} while (more);
@@ -1029,15 +1021,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
DPRINTK("ncp_rmdir: removing %s/%s\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
- /*
- * fail with EBUSY if there are still references to this
- * directory.
- */
- dentry_unhash(dentry);
- error = -EBUSY;
- if (!d_unhashed(dentry))
- goto out;
-
len = sizeof(__name);
error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
dentry->d_name.len, !ncp_preserve_case(dir));
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a13d26ede254..0bc27684ebfa 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -414,7 +414,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
spin_lock(&tbl->slot_tbl_lock);
/* state manager is resetting the session */
- if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
+ if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
spin_unlock(&tbl->slot_tbl_lock);
status = htonl(NFS4ERR_DELAY);
/* Return NFS4ERR_BADSESSION if we're draining the session
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 59461c957d9d..a35582c9d444 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -763,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
* A single slot, so highest used slotid is either 0 or -1
*/
tbl->highest_used_slotid = NFS4_NO_SLOT;
- nfs4_session_drain_complete(session, tbl);
+ nfs4_slot_tbl_drain_complete(tbl);
spin_unlock(&tbl->slot_tbl_lock);
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e093e73178b7..5d051419527b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -46,7 +46,7 @@
static int nfs_opendir(struct inode *, struct file *);
static int nfs_closedir(struct inode *, struct file *);
-static int nfs_readdir(struct file *, void *, filldir_t);
+static int nfs_readdir(struct file *, struct dir_context *);
static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
static void nfs_readdir_clear_array(struct page*);
@@ -54,7 +54,7 @@ static void nfs_readdir_clear_array(struct page*);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
.read = generic_read_dir,
- .readdir = nfs_readdir,
+ .iterate = nfs_readdir,
.open = nfs_opendir,
.release = nfs_closedir,
.fsync = nfs_fsync_dir,
@@ -147,6 +147,7 @@ typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
typedef struct {
struct file *file;
struct page *page;
+ struct dir_context *ctx;
unsigned long page_index;
u64 *dir_cookie;
u64 last_cookie;
@@ -252,7 +253,7 @@ out:
static
int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
{
- loff_t diff = desc->file->f_pos - desc->current_index;
+ loff_t diff = desc->ctx->pos - desc->current_index;
unsigned int index;
if (diff < 0)
@@ -289,7 +290,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
|| (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
- } else if (new_pos < desc->file->f_pos) {
+ } else if (new_pos < desc->ctx->pos) {
if (ctx->duped > 0
&& ctx->dup_cookie == *desc->dir_cookie) {
if (printk_ratelimit()) {
@@ -307,7 +308,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
ctx->dup_cookie = *desc->dir_cookie;
ctx->duped = -1;
}
- desc->file->f_pos = new_pos;
+ desc->ctx->pos = new_pos;
desc->cache_entry_index = i;
return 0;
}
@@ -405,13 +406,13 @@ different:
}
static
-bool nfs_use_readdirplus(struct inode *dir, struct file *filp)
+bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
{
if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
return false;
if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
return true;
- if (filp->f_pos == 0)
+ if (ctx->pos == 0)
return true;
return false;
}
@@ -702,8 +703,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
* Once we've found the start of the dirent within a page: fill 'er up...
*/
static
-int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
- filldir_t filldir)
+int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
{
struct file *file = desc->file;
int i = 0;
@@ -721,13 +721,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
struct nfs_cache_array_entry *ent;
ent = &array->array[i];
- if (filldir(dirent, ent->string.name, ent->string.len,
- file->f_pos, nfs_compat_user_ino64(ent->ino),
- ent->d_type) < 0) {
+ if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
+ nfs_compat_user_ino64(ent->ino), ent->d_type)) {
desc->eof = 1;
break;
}
- file->f_pos++;
+ desc->ctx->pos++;
if (i < (array->size-1))
*desc->dir_cookie = array->array[i+1].cookie;
else
@@ -759,8 +758,7 @@ out:
* directory in the page cache by the time we get here.
*/
static inline
-int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
- filldir_t filldir)
+int uncached_readdir(nfs_readdir_descriptor_t *desc)
{
struct page *page = NULL;
int status;
@@ -785,7 +783,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
if (status < 0)
goto out_release;
- status = nfs_do_filldir(desc, dirent, filldir);
+ status = nfs_do_filldir(desc);
out:
dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
@@ -800,35 +798,36 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
last cookie cache takes care of the common case of reading the
whole directory.
*/
-static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
- struct nfs_open_dir_context *dir_ctx = filp->private_data;
+ struct nfs_open_dir_context *dir_ctx = file->private_data;
int res;
dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (long long)filp->f_pos);
+ (long long)ctx->pos);
nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
/*
- * filp->f_pos points to the dirent entry number.
+ * ctx->pos points to the dirent entry number.
* *desc->dir_cookie has the cookie for the next entry. We have
* to either find the entry with the appropriate number or
* revalidate the cookie.
*/
memset(desc, 0, sizeof(*desc));
- desc->file = filp;
+ desc->file = file;
+ desc->ctx = ctx;
desc->dir_cookie = &dir_ctx->dir_cookie;
desc->decode = NFS_PROTO(inode)->decode_dirent;
- desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0;
+ desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
nfs_block_sillyrename(dentry);
- res = nfs_revalidate_mapping(inode, filp->f_mapping);
+ res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
goto out;
@@ -840,7 +839,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* This means either end of directory */
if (*desc->dir_cookie && desc->eof == 0) {
/* Or that the server has 'lost' a cookie */
- res = uncached_readdir(desc, dirent, filldir);
+ res = uncached_readdir(desc);
if (res == 0)
continue;
}
@@ -857,7 +856,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (res < 0)
break;
- res = nfs_do_filldir(desc, dirent, filldir);
+ res = nfs_do_filldir(desc);
if (res < 0)
break;
} while (!desc->eof);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a87a44f84113..6b4a79f4ad1d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
* - Called if either PG_private or PG_fscache is set on the page
* - Caller holds page lock
*/
-static void nfs_invalidate_page(struct page *page, unsigned long offset)
+static void nfs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
- dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+ dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
+ page, offset, length);
- if (offset != 0)
+ if (offset != 0 || length < PAGE_CACHE_SIZE)
return;
/* Cancel any unstarted writes on this page */
nfs_wb_page_cancel(page_file_mapping(page)->host, page);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 947b0c908aa9..4cbad5d6b276 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -203,7 +203,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
if (error == -EINVAL)
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_NULL);
+ error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
if (error < 0)
goto error;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8fbc10054115..d7ba5616989c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -572,7 +572,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
task->tk_timeout = 0;
spin_lock(&tbl->slot_tbl_lock);
- if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
+ if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) &&
!args->sa_privileged) {
/* The state manager will wait until the slot table is empty */
dprintk("%s session is draining\n", __func__);
@@ -1078,7 +1078,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
struct nfs4_state *state = opendata->state;
struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_delegation *delegation;
- int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC);
+ int open_mode = opendata->o_arg.open_flags;
fmode_t fmode = opendata->o_arg.fmode;
nfs4_stateid stateid;
int ret = -EAGAIN;
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index ebda5f4a031b..c4e225e4a9af 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -73,7 +73,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
tbl->highest_used_slotid = new_max;
else {
tbl->highest_used_slotid = NFS4_NO_SLOT;
- nfs4_session_drain_complete(tbl->session, tbl);
+ nfs4_slot_tbl_drain_complete(tbl);
}
}
dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
@@ -226,7 +226,7 @@ static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
struct nfs4_slot *slot = pslot;
struct nfs4_slot_table *tbl = slot->table;
- if (nfs4_session_draining(tbl->session) && !args->sa_privileged)
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
return false;
slot->generation = tbl->generation;
args->sa_slot = slot;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 6f3cb39386d4..ff7d9f0f8a65 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -25,6 +25,10 @@ struct nfs4_slot {
};
/* Sessions */
+enum nfs4_slot_tbl_state {
+ NFS4_SLOT_TBL_DRAINING,
+};
+
#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long))
struct nfs4_slot_table {
struct nfs4_session *session; /* Parent session */
@@ -43,6 +47,7 @@ struct nfs4_slot_table {
unsigned long generation; /* Generation counter for
target_highest_slotid */
struct completion complete;
+ unsigned long slot_tbl_state;
};
/*
@@ -68,7 +73,6 @@ struct nfs4_session {
enum nfs4_session_state {
NFS4_SESSION_INITING,
- NFS4_SESSION_DRAINING,
};
#if defined(CONFIG_NFS_V4_1)
@@ -88,12 +92,11 @@ extern void nfs4_destroy_session(struct nfs4_session *session);
extern int nfs4_init_session(struct nfs_server *server);
extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
-extern void nfs4_session_drain_complete(struct nfs4_session *session,
- struct nfs4_slot_table *tbl);
+extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
-static inline bool nfs4_session_draining(struct nfs4_session *session)
+static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
{
- return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state);
+ return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
}
bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 300d17d85c0e..1fab140764c4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -241,7 +241,7 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
if (ses == NULL)
return;
tbl = &ses->fc_slot_table;
- if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
+ if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
spin_lock(&tbl->slot_tbl_lock);
nfs41_wake_slot_table(tbl);
spin_unlock(&tbl->slot_tbl_lock);
@@ -251,15 +251,15 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
/*
* Signal state manager thread if session fore channel is drained
*/
-void nfs4_session_drain_complete(struct nfs4_session *session,
- struct nfs4_slot_table *tbl)
+void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
{
- if (nfs4_session_draining(session))
+ if (nfs4_slot_tbl_draining(tbl))
complete(&tbl->complete);
}
-static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
+static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
{
+ set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
spin_lock(&tbl->slot_tbl_lock);
if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
INIT_COMPLETION(tbl->complete);
@@ -275,13 +275,12 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
struct nfs4_session *ses = clp->cl_session;
int ret = 0;
- set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
/* back channel */
- ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
+ ret = nfs4_drain_slot_tbl(&ses->bc_slot_table);
if (ret)
return ret;
/* fore channel */
- return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
+ return nfs4_drain_slot_tbl(&ses->fc_slot_table);
}
static void nfs41_finish_session_reset(struct nfs_client *clp)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a366107a7331..2d7525fbcf25 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1942,6 +1942,7 @@ static int nfs23_validate_mount_data(void *options,
args->namlen = data->namlen;
args->bsize = data->bsize;
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
if (data->flags & NFS_MOUNT_SECFLAVOUR)
args->auth_flavors[0] = data->pseudoflavor;
if (!args->nfs_server.hostname)
@@ -2637,6 +2638,7 @@ static int nfs4_validate_mount_data(void *options,
goto out_no_address;
args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
if (data->auth_flavourlen) {
if (data->auth_flavourlen > 1)
goto out_inval_auth;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 4e9a21db867a..105a3b080d12 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -240,11 +240,16 @@ struct name_list {
struct list_head list;
};
+struct nfs4_dir_ctx {
+ struct dir_context ctx;
+ struct list_head names;
+};
+
static int
nfsd4_build_namelist(void *arg, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
- struct list_head *names = arg;
+ struct nfs4_dir_ctx *ctx = arg;
struct name_list *entry;
if (namlen != HEXDIR_LEN - 1)
@@ -254,7 +259,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
return -ENOMEM;
memcpy(entry->name, name, HEXDIR_LEN - 1);
entry->name[HEXDIR_LEN - 1] = '\0';
- list_add(&entry->list, names);
+ list_add(&entry->list, &ctx->names);
return 0;
}
@@ -263,7 +268,10 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
{
const struct cred *original_cred;
struct dentry *dir = nn->rec_file->f_path.dentry;
- LIST_HEAD(names);
+ struct nfs4_dir_ctx ctx = {
+ .ctx.actor = nfsd4_build_namelist,
+ .names = LIST_HEAD_INIT(ctx.names)
+ };
int status;
status = nfs4_save_creds(&original_cred);
@@ -276,11 +284,11 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
return status;
}
- status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);
+ status = iterate_dir(nn->rec_file, &ctx.ctx);
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
- while (!list_empty(&names)) {
+ while (!list_empty(&ctx.names)) {
struct name_list *entry;
- entry = list_entry(names.next, struct name_list, list);
+ entry = list_entry(ctx.names.next, struct name_list, list);
if (!status) {
struct dentry *dentry;
dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 84ce601d8063..a6bc8a7423db 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1912,6 +1912,7 @@ struct buffered_dirent {
};
struct readdir_data {
+ struct dir_context ctx;
char *dirent;
size_t used;
int full;
@@ -1943,13 +1944,15 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
struct readdir_cd *cdp, loff_t *offsetp)
{
- struct readdir_data buf;
struct buffered_dirent *de;
int host_err;
int size;
loff_t offset;
+ struct readdir_data buf = {
+ .ctx.actor = nfsd_buffered_filldir,
+ .dirent = (void *)__get_free_page(GFP_KERNEL)
+ };
- buf.dirent = (void *)__get_free_page(GFP_KERNEL);
if (!buf.dirent)
return nfserrno(-ENOMEM);
@@ -1963,7 +1966,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
buf.used = 0;
buf.full = 0;
- host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf);
+ host_err = iterate_dir(file, &buf.ctx);
if (buf.full)
host_err = 0;
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index f30b017740a7..197a63e9d102 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -256,22 +256,18 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
-static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nilfs_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
- unsigned char *types = NULL;
- int ret;
if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
- goto success;
-
- types = nilfs_filetype_table;
+ return 0;
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -281,9 +277,8 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (IS_ERR(page)) {
nilfs_error(sb, __func__, "bad page in #%lu",
inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
- ret = -EIO;
- goto done;
+ ctx->pos += PAGE_CACHE_SIZE - offset;
+ return -EIO;
}
kaddr = page_address(page);
de = (struct nilfs_dir_entry *)(kaddr + offset);
@@ -293,35 +288,28 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (de->rec_len == 0) {
nilfs_error(sb, __func__,
"zero-length directory entry");
- ret = -EIO;
nilfs_put_page(page);
- goto done;
+ return -EIO;
}
if (de->inode) {
- int over;
- unsigned char d_type = DT_UNKNOWN;
+ unsigned char t;
- if (types && de->file_type < NILFS_FT_MAX)
- d_type = types[de->file_type];
+ if (de->file_type < NILFS_FT_MAX)
+ t = nilfs_filetype_table[de->file_type];
+ else
+ t = DT_UNKNOWN;
- offset = (char *)de - kaddr;
- over = filldir(dirent, de->name, de->name_len,
- (n<<PAGE_CACHE_SHIFT) | offset,
- le64_to_cpu(de->inode), d_type);
- if (over) {
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le64_to_cpu(de->inode), t)) {
nilfs_put_page(page);
- goto success;
+ return 0;
}
}
- filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
+ ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
}
nilfs_put_page(page);
}
-
-success:
- ret = 0;
-done:
- return ret;
+ return 0;
}
/*
@@ -678,7 +666,7 @@ not_empty:
const struct file_operations nilfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = nilfs_readdir,
+ .iterate = nilfs_readdir,
.unlocked_ioctl = nilfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = nilfs_compat_ioctl,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 689fb608648e..bccfec8343c5 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -219,13 +219,32 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
static int nilfs_set_page_dirty(struct page *page)
{
- int ret = __set_page_dirty_buffers(page);
+ int ret = __set_page_dirty_nobuffers(page);
- if (ret) {
+ if (page_has_buffers(page)) {
struct inode *inode = page->mapping->host;
- unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
+ unsigned nr_dirty = 0;
+ struct buffer_head *bh, *head;
- nilfs_set_file_dirty(inode, nr_dirty);
+ /*
+ * This page is locked by callers, and no other thread
+ * concurrently marks its buffers dirty since they are
+ * only dirtied through routines in fs/buffer.c in
+ * which call sites of mark_buffer_dirty are protected
+ * by page lock.
+ */
+ bh = head = page_buffers(page);
+ do {
+ /* Do not mark hole blocks dirty */
+ if (buffer_dirty(bh) || !buffer_mapped(bh))
+ continue;
+
+ set_buffer_dirty(bh);
+ nr_dirty++;
+ } while (bh = bh->b_this_page, bh != head);
+
+ if (nr_dirty)
+ nilfs_set_file_dirty(inode, nr_dirty);
}
return ret;
}
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f97af4..d267ea6aa1a0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1372,7 +1372,7 @@ retry_writepage:
* The page may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0);
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
ntfs_debug("Write outside i_size - truncated?");
return 0;
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index aa411c3f20e9..9e38dafa3bc7 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1004,13 +1004,11 @@ dir_err_out:
/**
* ntfs_filldir - ntfs specific filldir method
* @vol: current ntfs volume
- * @fpos: position in the directory
* @ndir: ntfs inode of current directory
* @ia_page: page in which the index allocation buffer @ie is in resides
* @ie: current index entry
* @name: buffer to use for the converted name
- * @dirent: vfs filldir callback context
- * @filldir: vfs filldir callback
+ * @actor: what to feed the entries to
*
* Convert the Unicode @name to the loaded NLS and pass it to the @filldir
* callback.
@@ -1024,12 +1022,12 @@ dir_err_out:
* retake the lock if we are returning a non-zero value as ntfs_readdir()
* would need to drop the lock immediately anyway.
*/
-static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
+static inline int ntfs_filldir(ntfs_volume *vol,
ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
- u8 *name, void *dirent, filldir_t filldir)
+ u8 *name, struct dir_context *actor)
{
unsigned long mref;
- int name_len, rc;
+ int name_len;
unsigned dt_type;
FILE_NAME_TYPE_FLAGS name_type;
@@ -1068,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
if (ia_page)
unlock_page(ia_page);
ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
- "0x%lx, DT_%s.", name, name_len, fpos, mref,
+ "0x%lx, DT_%s.", name, name_len, actor->pos, mref,
dt_type == DT_DIR ? "DIR" : "REG");
- rc = filldir(dirent, name, name_len, fpos, mref, dt_type);
+ if (!dir_emit(actor, name, name_len, mref, dt_type))
+ return 1;
/* Relock the page but not if we are aborting ->readdir. */
- if (!rc && ia_page)
+ if (ia_page)
lock_page(ia_page);
- return rc;
+ return 0;
}
/*
@@ -1097,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
* removes them again after the write is complete after which it
* unlocks the page.
*/
-static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ntfs_readdir(struct file *file, struct dir_context *actor)
{
s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
- loff_t fpos, i_size;
- struct inode *bmp_vi, *vdir = file_inode(filp);
+ loff_t i_size;
+ struct inode *bmp_vi, *vdir = file_inode(file);
struct super_block *sb = vdir->i_sb;
ntfs_inode *ndir = NTFS_I(vdir);
ntfs_volume *vol = NTFS_SB(sb);
@@ -1116,33 +1115,16 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
u8 *kaddr, *bmp, *index_end;
ntfs_attr_search_ctx *ctx;
- fpos = filp->f_pos;
ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
- vdir->i_ino, fpos);
+ vdir->i_ino, actor->pos);
rc = err = 0;
/* Are we at end of dir yet? */
i_size = i_size_read(vdir);
- if (fpos >= i_size + vol->mft_record_size)
- goto done;
+ if (actor->pos >= i_size + vol->mft_record_size)
+ return 0;
/* Emulate . and .. for all directories. */
- if (!fpos) {
- ntfs_debug("Calling filldir for . with len 1, fpos 0x0, "
- "inode 0x%lx, DT_DIR.", vdir->i_ino);
- rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
- if (rc)
- goto done;
- fpos++;
- }
- if (fpos == 1) {
- ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
- "inode 0x%lx, DT_DIR.",
- (unsigned long)parent_ino(filp->f_path.dentry));
- rc = filldir(dirent, "..", 2, fpos,
- parent_ino(filp->f_path.dentry), DT_DIR);
- if (rc)
- goto done;
- fpos++;
- }
+ if (!dir_emit_dots(file, actor))
+ return 0;
m = NULL;
ctx = NULL;
/*
@@ -1155,7 +1137,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto err_out;
}
/* Are we jumping straight into the index allocation attribute? */
- if (fpos >= vol->mft_record_size)
+ if (actor->pos >= vol->mft_record_size)
goto skip_index_root;
/* Get hold of the mft record for the directory. */
m = map_mft_record(ndir);
@@ -1170,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto err_out;
}
/* Get the offset into the index root attribute. */
- ir_pos = (s64)fpos;
+ ir_pos = (s64)actor->pos;
/* Find the index root attribute in the mft record. */
err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
0, ctx);
@@ -1226,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (ir_pos > (u8*)ie - (u8*)ir)
continue;
/* Advance the position even if going to skip the entry. */
- fpos = (u8*)ie - (u8*)ir;
+ actor->pos = (u8*)ie - (u8*)ir;
/* Submit the name to the filldir callback. */
- rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent,
- filldir);
+ rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
if (rc) {
kfree(ir);
goto abort;
@@ -1242,12 +1223,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (!NInoIndexAllocPresent(ndir))
goto EOD;
/* Advance fpos to the beginning of the index allocation. */
- fpos = vol->mft_record_size;
+ actor->pos = vol->mft_record_size;
skip_index_root:
kaddr = NULL;
prev_ia_pos = -1LL;
/* Get the offset into the index allocation attribute. */
- ia_pos = (s64)fpos - vol->mft_record_size;
+ ia_pos = (s64)actor->pos - vol->mft_record_size;
ia_mapping = vdir->i_mapping;
ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
@@ -1409,7 +1390,7 @@ find_next_index_buffer:
if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
continue;
/* Advance the position even if going to skip the entry. */
- fpos = (u8*)ie - (u8*)ia +
+ actor->pos = (u8*)ie - (u8*)ia +
(sle64_to_cpu(ia->index_block_vcn) <<
ndir->itype.index.vcn_size_bits) +
vol->mft_record_size;
@@ -1419,8 +1400,7 @@ find_next_index_buffer:
* before returning, unless a non-zero value is returned in
* which case the page is left unlocked.
*/
- rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent,
- filldir);
+ rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
if (rc) {
/* @ia_page is already unlocked in this case. */
ntfs_unmap_page(ia_page);
@@ -1439,18 +1419,9 @@ unm_EOD:
iput(bmp_vi);
EOD:
/* We are finished, set fpos to EOD. */
- fpos = i_size + vol->mft_record_size;
+ actor->pos = i_size + vol->mft_record_size;
abort:
kfree(name);
-done:
-#ifdef DEBUG
- if (!rc)
- ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
- else
- ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
- rc, fpos);
-#endif
- filp->f_pos = fpos;
return 0;
err_out:
if (bmp_page) {
@@ -1471,7 +1442,6 @@ iput_err_out:
if (!err)
err = -EIO;
ntfs_debug("Failed. Returning error code %i.", -err);
- filp->f_pos = fpos;
return err;
}
@@ -1571,7 +1541,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
const struct file_operations ntfs_dir_ops = {
.llseek = generic_file_llseek, /* Seek inside directory. */
.read = generic_read_dir, /* Return -EISDIR. */
- .readdir = ntfs_readdir, /* Read directory contents. */
+ .iterate = ntfs_readdir, /* Read directory contents. */
#ifdef NTFS_RW
.fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
/*.aio_fsync = ,*/ /* Sync all outstanding async
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec72e903..79736a28d84f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
* from ext3. PageChecked() bits have been removed as OCFS2 does not
* do journalled data.
*/
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
+static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
- jbd2_journal_invalidatepage(journal, page, offset);
+ jbd2_journal_invalidatepage(journal, page, offset, length);
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f1e1aed8f638..eb760d8acd50 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1761,11 +1761,10 @@ bail:
static int ocfs2_dir_foreach_blk_id(struct inode *inode,
u64 *f_version,
- loff_t *f_pos, void *priv,
- filldir_t filldir, int *filldir_err)
+ struct dir_context *ctx)
{
- int ret, i, filldir_ret;
- unsigned long offset = *f_pos;
+ int ret, i;
+ unsigned long offset = ctx->pos;
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
@@ -1781,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
di = (struct ocfs2_dinode *)di_bh->b_data;
data = &di->id2.i_data;
- while (*f_pos < i_size_read(inode)) {
-revalidate:
+ while (ctx->pos < i_size_read(inode)) {
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
@@ -1802,50 +1800,31 @@ revalidate:
break;
i += le16_to_cpu(de->rec_len);
}
- *f_pos = offset = i;
+ ctx->pos = offset = i;
*f_version = inode->i_version;
}
- de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
- if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
+ de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
+ if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
/* On error, skip the f_pos to the end. */
- *f_pos = i_size_read(inode);
- goto out;
+ ctx->pos = i_size_read(inode);
+ break;
}
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = *f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
- filldir_ret = filldir(priv, de->name,
- de->name_len,
- *f_pos,
- le64_to_cpu(de->inode),
- d_type);
- if (filldir_ret) {
- if (filldir_err)
- *filldir_err = filldir_ret;
- break;
- }
- if (version != *f_version)
- goto revalidate;
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le64_to_cpu(de->inode), d_type))
+ goto out;
}
- *f_pos += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
-
out:
brelse(di_bh);
-
return 0;
}
@@ -1855,27 +1834,26 @@ out:
*/
static int ocfs2_dir_foreach_blk_el(struct inode *inode,
u64 *f_version,
- loff_t *f_pos, void *priv,
- filldir_t filldir, int *filldir_err)
+ struct dir_context *ctx,
+ bool persist)
{
- int error = 0;
unsigned long offset, blk, last_ra_blk = 0;
- int i, stored;
+ int i;
struct buffer_head * bh, * tmp;
struct ocfs2_dir_entry * de;
struct super_block * sb = inode->i_sb;
unsigned int ra_sectors = 16;
+ int stored = 0;
- stored = 0;
bh = NULL;
- offset = (*f_pos) & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && *f_pos < i_size_read(inode)) {
- blk = (*f_pos) >> sb->s_blocksize_bits;
+ while (ctx->pos < i_size_read(inode)) {
+ blk = ctx->pos >> sb->s_blocksize_bits;
if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
/* Skip the corrupt dirblock and keep trying */
- *f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
@@ -1897,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
ra_sectors = 8;
}
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
@@ -1917,93 +1894,64 @@ revalidate:
i += le16_to_cpu(de->rec_len);
}
offset = i;
- *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
*f_version = inode->i_version;
}
- while (!error && *f_pos < i_size_read(inode)
+ while (ctx->pos < i_size_read(inode)
&& offset < sb->s_blocksize) {
de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
/* On error, skip the f_pos to the
next block. */
- *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
+ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
brelse(bh);
- goto out;
+ continue;
}
- offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- unsigned long version = *f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
- error = filldir(priv, de->name,
+ if (!dir_emit(ctx, de->name,
de->name_len,
- *f_pos,
le64_to_cpu(de->inode),
- d_type);
- if (error) {
- if (filldir_err)
- *filldir_err = error;
- break;
+ d_type)) {
+ brelse(bh);
+ return 0;
}
- if (version != *f_version)
- goto revalidate;
- stored ++;
+ stored++;
}
- *f_pos += le16_to_cpu(de->rec_len);
+ offset += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
offset = 0;
brelse(bh);
bh = NULL;
+ if (!persist && stored)
+ break;
}
-
- stored = 0;
-out:
- return stored;
+ return 0;
}
static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
- loff_t *f_pos, void *priv, filldir_t filldir,
- int *filldir_err)
+ struct dir_context *ctx,
+ bool persist)
{
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
- filldir, filldir_err);
-
- return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
- filldir_err);
+ return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
+ return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
}
/*
* This is intended to be called from inside other kernel functions,
* so we fake some arguments.
*/
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
- filldir_t filldir)
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
{
- int ret = 0, filldir_err = 0;
u64 version = inode->i_version;
-
- while (*f_pos < i_size_read(inode)) {
- ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
- filldir, &filldir_err);
- if (ret || filldir_err)
- break;
- }
-
- if (ret > 0)
- ret = -EIO;
-
+ ocfs2_dir_foreach_blk(inode, &version, ctx, true);
return 0;
}
@@ -2011,15 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
* ocfs2_readdir()
*
*/
-int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int ocfs2_readdir(struct file *file, struct dir_context *ctx)
{
int error = 0;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
int lock_level = 0;
trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
- error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
+ error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
if (lock_level && error >= 0) {
/* We release EX lock which used to update atime
* and get PR lock again to reduce contention
@@ -2035,8 +1983,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
goto bail_nolock;
}
- error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
- dirent, filldir, NULL);
+ error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
ocfs2_inode_unlock(inode, lock_level);
if (error)
@@ -2120,6 +2067,7 @@ bail:
}
struct ocfs2_empty_dir_priv {
+ struct dir_context ctx;
unsigned seen_dot;
unsigned seen_dot_dot;
unsigned seen_other;
@@ -2204,8 +2152,9 @@ out:
int ocfs2_empty_dir(struct inode *inode)
{
int ret;
- loff_t start = 0;
- struct ocfs2_empty_dir_priv priv;
+ struct ocfs2_empty_dir_priv priv = {
+ .ctx.actor = ocfs2_empty_dir_filldir
+ };
memset(&priv, 0, sizeof(priv));
@@ -2219,7 +2168,7 @@ int ocfs2_empty_dir(struct inode *inode)
*/
}
- ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
+ ret = ocfs2_dir_foreach(inode, &priv.ctx);
if (ret)
mlog_errno(ret);
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e683f3deb645..f0344b75b14d 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name,
struct ocfs2_dir_lookup_result *res);
int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
int namelen, u64 *blkno);
-int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
- filldir_t filldir);
+int ocfs2_readdir(struct file *file, struct dir_context *ctx);
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b3fdd1a323d6..e68588e6b1e8 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1408,6 +1408,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 1c39efb71bab..2487116d0d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -790,7 +790,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
&hole_size, &rec, &is_last);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_unlock;
}
if (rec.e_blkno == 0ULL) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8a7509f9e6f5..8a38714f1d92 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2288,7 +2288,7 @@ relock:
ret = ocfs2_inode_lock(inode, NULL, 1);
if (ret < 0) {
mlog_errno(ret);
- goto out_sems;
+ goto out;
}
ocfs2_inode_unlock(inode, 1);
@@ -2712,7 +2712,7 @@ const struct file_operations ocfs2_fops = {
const struct file_operations ocfs2_dops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ocfs2_readdir,
+ .iterate = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
@@ -2759,7 +2759,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
const struct file_operations ocfs2_dops_no_plocks = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ocfs2_readdir,
+ .iterate = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8eccfabcd12e..242170d83971 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1941,6 +1941,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
}
struct ocfs2_orphan_filldir_priv {
+ struct dir_context ctx;
struct inode *head;
struct ocfs2_super *osb;
};
@@ -1977,11 +1978,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
{
int status;
struct inode *orphan_dir_inode = NULL;
- struct ocfs2_orphan_filldir_priv priv;
- loff_t pos = 0;
-
- priv.osb = osb;
- priv.head = *head;
+ struct ocfs2_orphan_filldir_priv priv = {
+ .ctx.actor = ocfs2_orphan_filldir,
+ .osb = osb,
+ .head = *head
+ };
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
@@ -1999,8 +2000,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
- ocfs2_orphan_filldir);
+ status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
if (status) {
mlog_errno(status);
goto out_cluster;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 04ee1b57c243..b4a5cdf9dbc5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -947,7 +947,7 @@ leave:
ocfs2_free_dir_lookup_result(&orphan_insert);
ocfs2_free_dir_lookup_result(&lookup);
- if (status)
+ if (status && (status != -ENOTEMPTY))
mlog_errno(status);
return status;
@@ -2216,7 +2216,7 @@ out:
brelse(orphan_dir_bh);
- return 0;
+ return ret;
}
int ocfs2_create_inode_in_orphan(struct inode *dir,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index acbaebcad3a8..1b8e9e8405b2 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -327,26 +327,23 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
return is_bad;
}
-static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
+static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
u64 fsblock, int hindex)
{
- struct inode *dir = file_inode(filp);
- struct buffer_head *bh;
- struct omfs_inode *oi;
- u64 self;
- int res = 0;
- unsigned char d_type;
-
/* follow chain in this bucket */
while (fsblock != ~0) {
- bh = omfs_bread(dir->i_sb, fsblock);
+ struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock);
+ struct omfs_inode *oi;
+ u64 self;
+ unsigned char d_type;
+
if (!bh)
- goto out;
+ return true;
oi = (struct omfs_inode *) bh->b_data;
if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
brelse(bh);
- goto out;
+ return true;
}
self = fsblock;
@@ -361,15 +358,16 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
- res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
- OMFS_NAMELEN), filp->f_pos, self, d_type);
+ if (!dir_emit(ctx, oi->i_name,
+ strnlen(oi->i_name, OMFS_NAMELEN),
+ self, d_type)) {
+ brelse(bh);
+ return false;
+ }
brelse(bh);
- if (res < 0)
- break;
- filp->f_pos++;
+ ctx->pos++;
}
-out:
- return res;
+ return true;
}
static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -403,60 +401,44 @@ out:
return err;
}
-static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int omfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *dir = file_inode(filp);
+ struct inode *dir = file_inode(file);
struct buffer_head *bh;
- loff_t offset, res;
+ __be64 *p;
unsigned int hchain, hindex;
int nbuckets;
- u64 fsblock;
- int ret = -EINVAL;
-
- if (filp->f_pos >> 32)
- goto success;
-
- switch ((unsigned long) filp->f_pos) {
- case 0:
- if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
- goto success;
- filp->f_pos++;
- /* fall through */
- case 1:
- if (filldir(dirent, "..", 2, 1,
- parent_ino(filp->f_dentry), DT_DIR) < 0)
- goto success;
- filp->f_pos = 1 << 20;
- /* fall through */
+
+ if (ctx->pos >> 32)
+ return -EINVAL;
+
+ if (ctx->pos < 1 << 20) {
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+ ctx->pos = 1 << 20;
}
nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
/* high 12 bits store bucket + 1 and low 20 bits store hash index */
- hchain = (filp->f_pos >> 20) - 1;
- hindex = filp->f_pos & 0xfffff;
+ hchain = (ctx->pos >> 20) - 1;
+ hindex = ctx->pos & 0xfffff;
bh = omfs_bread(dir->i_sb, dir->i_ino);
if (!bh)
- goto out;
+ return -EINVAL;
- offset = OMFS_DIR_START + hchain * 8;
+ p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain;
- for (; hchain < nbuckets; hchain++, offset += 8) {
- fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset]));
-
- res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex);
- hindex = 0;
- if (res < 0)
+ for (; hchain < nbuckets; hchain++) {
+ __u64 fsblock = be64_to_cpu(*p++);
+ if (!omfs_fill_chain(dir, ctx, fsblock, hindex))
break;
-
- filp->f_pos = (hchain+2) << 20;
+ hindex = 0;
+ ctx->pos = (hchain+2) << 20;
}
brelse(bh);
-success:
- ret = 0;
-out:
- return ret;
+ return 0;
}
const struct inode_operations omfs_dir_inops = {
@@ -470,6 +452,6 @@ const struct inode_operations omfs_dir_inops = {
const struct file_operations omfs_dir_operations = {
.read = generic_read_dir,
- .readdir = omfs_readdir,
+ .iterate = omfs_readdir,
.llseek = generic_file_llseek,
};
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 75885ffde44e..8c0ceb8dd1f7 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -162,11 +162,11 @@ static const struct file_operations openpromfs_prop_ops = {
.release = seq_release,
};
-static int openpromfs_readdir(struct file *, void *, filldir_t);
+static int openpromfs_readdir(struct file *, struct dir_context *);
static const struct file_operations openprom_operations = {
.read = generic_read_dir,
- .readdir = openpromfs_readdir,
+ .iterate = openpromfs_readdir,
.llseek = generic_file_llseek,
};
@@ -260,71 +260,64 @@ found:
return NULL;
}
-static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct op_inode_info *oi = OP_I(inode);
struct device_node *dp = oi->u.node;
struct device_node *child;
struct property *prop;
- unsigned int ino;
int i;
mutex_lock(&op_mutex);
- ino = inode->i_ino;
- i = filp->f_pos;
- switch (i) {
- case 0:
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+ if (ctx->pos == 0) {
+ if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
goto out;
- i++;
- filp->f_pos++;
- /* fall thru */
- case 1:
- if (filldir(dirent, "..", 2, i,
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
+ if (!dir_emit(ctx, "..", 2,
(dp->parent == NULL ?
OPENPROM_ROOT_INO :
- dp->parent->unique_id), DT_DIR) < 0)
+ dp->parent->unique_id), DT_DIR))
goto out;
- i++;
- filp->f_pos++;
- /* fall thru */
- default:
- i -= 2;
-
- /* First, the children nodes as directories. */
- child = dp->child;
- while (i && child) {
- child = child->sibling;
- i--;
- }
- while (child) {
- if (filldir(dirent,
- child->path_component_name,
- strlen(child->path_component_name),
- filp->f_pos, child->unique_id, DT_DIR) < 0)
- goto out;
-
- filp->f_pos++;
- child = child->sibling;
- }
+ ctx->pos = 2;
+ }
+ i = ctx->pos - 2;
- /* Next, the properties as files. */
- prop = dp->properties;
- while (i && prop) {
- prop = prop->next;
- i--;
- }
- while (prop) {
- if (filldir(dirent, prop->name, strlen(prop->name),
- filp->f_pos, prop->unique_id, DT_REG) < 0)
- goto out;
+ /* First, the children nodes as directories. */
+ child = dp->child;
+ while (i && child) {
+ child = child->sibling;
+ i--;
+ }
+ while (child) {
+ if (!dir_emit(ctx,
+ child->path_component_name,
+ strlen(child->path_component_name),
+ child->unique_id, DT_DIR))
+ goto out;
- filp->f_pos++;
- prop = prop->next;
- }
+ ctx->pos++;
+ child = child->sibling;
+ }
+
+ /* Next, the properties as files. */
+ prop = dp->properties;
+ while (i && prop) {
+ prop = prop->next;
+ i--;
}
+ while (prop) {
+ if (!dir_emit(ctx, prop->name, strlen(prop->name),
+ prop->unique_id, DT_REG))
+ goto out;
+
+ ctx->pos++;
+ prop = prop->next;
+ }
+
out:
mutex_unlock(&op_mutex);
return 0;
diff --git a/fs/pnode.c b/fs/pnode.c
index 3d2a7141b87a..9af0df15256e 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -83,7 +83,8 @@ static int do_make_slave(struct mount *mnt)
if (peer_mnt == mnt)
peer_mnt = NULL;
}
- if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share))
+ if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) &&
+ list_empty(&mnt->mnt_share))
mnt_release_group_id(mnt);
list_del_init(&mnt->mnt_share);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd51e50001fe..0016350ad95e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1681,11 +1681,11 @@ const struct dentry_operations pid_dentry_operations =
* reported by readdir in sync with the inode numbers reported
* by stat.
*/
-int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+bool proc_fill_cache(struct file *file, struct dir_context *ctx,
const char *name, int len,
instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
- struct dentry *child, *dir = filp->f_path.dentry;
+ struct dentry *child, *dir = file->f_path.dentry;
struct inode *inode;
struct qstr qname;
ino_t ino = 0;
@@ -1720,7 +1720,7 @@ end_instantiate:
ino = find_inode_number(dir, &qname);
if (!ino)
ino = 1;
- return filldir(dirent, name, len, filp->f_pos, ino, type);
+ return dir_emit(ctx, name, len, ino, type);
}
#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1931,14 +1931,15 @@ static const struct inode_operations proc_map_files_inode_operations = {
};
static int
-proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
struct vm_area_struct *vma;
struct task_struct *task;
struct mm_struct *mm;
- ino_t ino;
+ unsigned long nr_files, pos, i;
+ struct flex_array *fa = NULL;
+ struct map_files_info info;
+ struct map_files_info *p;
int ret;
ret = -EPERM;
@@ -1946,7 +1947,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto out;
ret = -ENOENT;
- task = get_proc_task(inode);
+ task = get_proc_task(file_inode(file));
if (!task)
goto out;
@@ -1955,91 +1956,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto out_put_task;
ret = 0;
- switch (filp->f_pos) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
- goto out_put_task;
- filp->f_pos++;
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
- goto out_put_task;
- filp->f_pos++;
- default:
- {
- unsigned long nr_files, pos, i;
- struct flex_array *fa = NULL;
- struct map_files_info info;
- struct map_files_info *p;
-
- mm = get_task_mm(task);
- if (!mm)
- goto out_put_task;
- down_read(&mm->mmap_sem);
+ if (!dir_emit_dots(file, ctx))
+ goto out_put_task;
- nr_files = 0;
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_put_task;
+ down_read(&mm->mmap_sem);
- /*
- * We need two passes here:
- *
- * 1) Collect vmas of mapped files with mmap_sem taken
- * 2) Release mmap_sem and instantiate entries
- *
- * otherwise we get lockdep complained, since filldir()
- * routine might require mmap_sem taken in might_fault().
- */
+ nr_files = 0;
- for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
- if (vma->vm_file && ++pos > filp->f_pos)
- nr_files++;
- }
+ /*
+ * We need two passes here:
+ *
+ * 1) Collect vmas of mapped files with mmap_sem taken
+ * 2) Release mmap_sem and instantiate entries
+ *
+ * otherwise we get lockdep complained, since filldir()
+ * routine might require mmap_sem taken in might_fault().
+ */
- if (nr_files) {
- fa = flex_array_alloc(sizeof(info), nr_files,
- GFP_KERNEL);
- if (!fa || flex_array_prealloc(fa, 0, nr_files,
- GFP_KERNEL)) {
- ret = -ENOMEM;
- if (fa)
- flex_array_free(fa);
- up_read(&mm->mmap_sem);
- mmput(mm);
- goto out_put_task;
- }
- for (i = 0, vma = mm->mmap, pos = 2; vma;
- vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (++pos <= filp->f_pos)
- continue;
-
- info.mode = vma->vm_file->f_mode;
- info.len = snprintf(info.name,
- sizeof(info.name), "%lx-%lx",
- vma->vm_start, vma->vm_end);
- if (flex_array_put(fa, i++, &info, GFP_KERNEL))
- BUG();
- }
+ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ if (vma->vm_file && ++pos > ctx->pos)
+ nr_files++;
+ }
+
+ if (nr_files) {
+ fa = flex_array_alloc(sizeof(info), nr_files,
+ GFP_KERNEL);
+ if (!fa || flex_array_prealloc(fa, 0, nr_files,
+ GFP_KERNEL)) {
+ ret = -ENOMEM;
+ if (fa)
+ flex_array_free(fa);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ goto out_put_task;
}
- up_read(&mm->mmap_sem);
-
- for (i = 0; i < nr_files; i++) {
- p = flex_array_get(fa, i);
- ret = proc_fill_cache(filp, dirent, filldir,
- p->name, p->len,
- proc_map_files_instantiate,
- task,
- (void *)(unsigned long)p->mode);
- if (ret)
- break;
- filp->f_pos++;
+ for (i = 0, vma = mm->mmap, pos = 2; vma;
+ vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= ctx->pos)
+ continue;
+
+ info.mode = vma->vm_file->f_mode;
+ info.len = snprintf(info.name,
+ sizeof(info.name), "%lx-%lx",
+ vma->vm_start, vma->vm_end);
+ if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+ BUG();
}
- if (fa)
- flex_array_free(fa);
- mmput(mm);
}
+ up_read(&mm->mmap_sem);
+
+ for (i = 0; i < nr_files; i++) {
+ p = flex_array_get(fa, i);
+ if (!proc_fill_cache(file, ctx,
+ p->name, p->len,
+ proc_map_files_instantiate,
+ task,
+ (void *)(unsigned long)p->mode))
+ break;
+ ctx->pos++;
}
+ if (fa)
+ flex_array_free(fa);
+ mmput(mm);
out_put_task:
put_task_struct(task);
@@ -2049,7 +2032,7 @@ out:
static const struct file_operations proc_map_files_operations = {
.read = generic_read_dir,
- .readdir = proc_map_files_readdir,
+ .iterate = proc_map_files_readdir,
.llseek = default_llseek,
};
@@ -2118,6 +2101,7 @@ static int show_timer(struct seq_file *m, void *v)
nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
pid_nr_ns(timer->it_pid, tp->ns));
+ seq_printf(m, "ClockID: %d\n", timer->it_clock);
return 0;
}
@@ -2216,67 +2200,30 @@ out_no_task:
return error;
}
-static int proc_pident_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
-{
- return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
- proc_pident_instantiate, task, p);
-}
-
-static int proc_pident_readdir(struct file *filp,
- void *dirent, filldir_t filldir,
+static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
const struct pid_entry *ents, unsigned int nents)
{
- int i;
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
- const struct pid_entry *p, *last;
- ino_t ino;
- int ret;
+ struct task_struct *task = get_proc_task(file_inode(file));
+ const struct pid_entry *p;
- ret = -ENOENT;
if (!task)
- goto out_no_task;
+ return -ENOENT;
- ret = 0;
- i = filp->f_pos;
- switch (i) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- default:
- i -= 2;
- if (i >= nents) {
- ret = 1;
- goto out;
- }
- p = ents + i;
- last = &ents[nents - 1];
- while (p <= last) {
- if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
- goto out;
- filp->f_pos++;
- p++;
- }
- }
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+
+ if (ctx->pos >= nents + 2)
+ goto out;
- ret = 1;
+ for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
+ if (!proc_fill_cache(file, ctx, p->name, p->len,
+ proc_pident_instantiate, task, p))
+ break;
+ ctx->pos++;
+ }
out:
put_task_struct(task);
-out_no_task:
- return ret;
+ return 0;
}
#ifdef CONFIG_SECURITY
@@ -2361,16 +2308,15 @@ static const struct pid_entry attr_dir_stuff[] = {
REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
};
-static int proc_attr_dir_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_pident_readdir(filp,dirent,filldir,
- attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
+ return proc_pident_readdir(file, ctx,
+ attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
}
static const struct file_operations proc_attr_dir_operations = {
.read = generic_read_dir,
- .readdir = proc_attr_dir_readdir,
+ .iterate = proc_attr_dir_readdir,
.llseek = default_llseek,
};
@@ -2724,16 +2670,15 @@ static const struct pid_entry tgid_base_stuff[] = {
#endif
};
-static int proc_tgid_base_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_pident_readdir(filp,dirent,filldir,
- tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
+ return proc_pident_readdir(file, ctx,
+ tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
static const struct file_operations proc_tgid_base_operations = {
.read = generic_read_dir,
- .readdir = proc_tgid_base_readdir,
+ .iterate = proc_tgid_base_readdir,
.llseek = default_llseek,
};
@@ -2935,58 +2880,42 @@ retry:
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
-static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- struct tgid_iter iter)
-{
- char name[PROC_NUMBUF];
- int len = snprintf(name, sizeof(name), "%d", iter.tgid);
- return proc_fill_cache(filp, dirent, filldir, name, len,
- proc_pid_instantiate, iter.task, NULL);
-}
-
-static int fake_filldir(void *buf, const char *name, int namelen,
- loff_t offset, u64 ino, unsigned d_type)
-{
- return 0;
-}
-
/* for the /proc/ directory itself, after non-process stuff has been done */
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
struct tgid_iter iter;
struct pid_namespace *ns;
- filldir_t __filldir;
- loff_t pos = filp->f_pos;
+ loff_t pos = ctx->pos;
if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
- goto out;
+ return 0;
if (pos == TGID_OFFSET - 1) {
- if (proc_fill_cache(filp, dirent, filldir, "self", 4,
- NULL, NULL, NULL) < 0)
- goto out;
+ if (!proc_fill_cache(file, ctx, "self", 4, NULL, NULL, NULL))
+ return 0;
iter.tgid = 0;
} else {
iter.tgid = pos - TGID_OFFSET;
}
iter.task = NULL;
- ns = filp->f_dentry->d_sb->s_fs_info;
+ ns = file->f_dentry->d_sb->s_fs_info;
for (iter = next_tgid(ns, iter);
iter.task;
iter.tgid += 1, iter = next_tgid(ns, iter)) {
- if (has_pid_permissions(ns, iter.task, 2))
- __filldir = filldir;
- else
- __filldir = fake_filldir;
+ char name[PROC_NUMBUF];
+ int len;
+ if (!has_pid_permissions(ns, iter.task, 2))
+ continue;
- filp->f_pos = iter.tgid + TGID_OFFSET;
- if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
+ len = snprintf(name, sizeof(name), "%d", iter.tgid);
+ ctx->pos = iter.tgid + TGID_OFFSET;
+ if (!proc_fill_cache(file, ctx, name, len,
+ proc_pid_instantiate, iter.task, NULL)) {
put_task_struct(iter.task);
- goto out;
+ return 0;
}
}
- filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
-out:
+ ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
return 0;
}
@@ -3074,11 +3003,10 @@ static const struct pid_entry tid_base_stuff[] = {
#endif
};
-static int proc_tid_base_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_pident_readdir(filp,dirent,filldir,
- tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
+ return proc_pident_readdir(file, ctx,
+ tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
}
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -3089,7 +3017,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
static const struct file_operations proc_tid_base_operations = {
.read = generic_read_dir,
- .readdir = proc_tid_base_readdir,
+ .iterate = proc_tid_base_readdir,
.llseek = default_llseek,
};
@@ -3230,30 +3158,16 @@ static struct task_struct *next_tid(struct task_struct *start)
return pos;
}
-static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- struct task_struct *task, int tid)
-{
- char name[PROC_NUMBUF];
- int len = snprintf(name, sizeof(name), "%d", tid);
- return proc_fill_cache(filp, dirent, filldir, name, len,
- proc_task_instantiate, task, NULL);
-}
-
/* for the /proc/TGID/task/ directories */
-static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
struct task_struct *leader = NULL;
- struct task_struct *task;
- int retval = -ENOENT;
- ino_t ino;
- int tid;
+ struct task_struct *task = get_proc_task(file_inode(file));
struct pid_namespace *ns;
+ int tid;
- task = get_proc_task(inode);
if (!task)
- goto out_no_task;
+ return -ENOENT;
rcu_read_lock();
if (pid_alive(task)) {
leader = task->group_leader;
@@ -3262,46 +3176,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
rcu_read_unlock();
put_task_struct(task);
if (!leader)
- goto out_no_task;
- retval = 0;
+ return -ENOENT;
- switch ((unsigned long)filp->f_pos) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- /* fall through */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- /* fall through */
- }
+ if (!dir_emit_dots(file, ctx))
+ goto out;
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
*/
- ns = filp->f_dentry->d_sb->s_fs_info;
- tid = (int)filp->f_version;
- filp->f_version = 0;
- for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
+ ns = file->f_dentry->d_sb->s_fs_info;
+ tid = (int)file->f_version;
+ file->f_version = 0;
+ for (task = first_tid(leader, tid, ctx->pos - 2, ns);
task;
- task = next_tid(task), filp->f_pos++) {
+ task = next_tid(task), ctx->pos++) {
+ char name[PROC_NUMBUF];
+ int len;
tid = task_pid_nr_ns(task, ns);
- if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
+ len = snprintf(name, sizeof(name), "%d", tid);
+ if (!proc_fill_cache(file, ctx, name, len,
+ proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readir call */
- filp->f_version = (u64)tid;
+ file->f_version = (u64)tid;
put_task_struct(task);
break;
}
}
out:
put_task_struct(leader);
-out_no_task:
- return retval;
+ return 0;
}
static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -3327,6 +3231,6 @@ static const struct inode_operations proc_task_inode_operations = {
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
- .readdir = proc_task_readdir,
+ .iterate = proc_task_readdir,
.llseek = default_llseek,
};
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d7a4a28ef630..1441f143c43b 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -219,74 +219,58 @@ out_no_task:
return result;
}
-static int proc_readfd_common(struct file * filp, void * dirent,
- filldir_t filldir, instantiate_t instantiate)
+static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+ instantiate_t instantiate)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct task_struct *p = get_proc_task(inode);
+ struct task_struct *p = get_proc_task(file_inode(file));
struct files_struct *files;
- unsigned int fd, ino;
- int retval;
+ unsigned int fd;
- retval = -ENOENT;
if (!p)
- goto out_no_task;
- retval = 0;
-
- fd = filp->f_pos;
- switch (fd) {
- case 0:
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- default:
- files = get_files_struct(p);
- if (!files)
- goto out;
- rcu_read_lock();
- for (fd = filp->f_pos - 2;
- fd < files_fdtable(files)->max_fds;
- fd++, filp->f_pos++) {
- char name[PROC_NUMBUF];
- int len;
- int rv;
-
- if (!fcheck_files(files, fd))
- continue;
- rcu_read_unlock();
+ return -ENOENT;
- len = snprintf(name, sizeof(name), "%d", fd);
- rv = proc_fill_cache(filp, dirent, filldir,
- name, len, instantiate, p,
- (void *)(unsigned long)fd);
- if (rv < 0)
- goto out_fd_loop;
- rcu_read_lock();
- }
- rcu_read_unlock();
-out_fd_loop:
- put_files_struct(files);
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ files = get_files_struct(p);
+ if (!files)
+ goto out;
+
+ rcu_read_lock();
+ for (fd = ctx->pos - 2;
+ fd < files_fdtable(files)->max_fds;
+ fd++, ctx->pos++) {
+ char name[PROC_NUMBUF];
+ int len;
+
+ if (!fcheck_files(files, fd))
+ continue;
+ rcu_read_unlock();
+
+ len = snprintf(name, sizeof(name), "%d", fd);
+ if (!proc_fill_cache(file, ctx,
+ name, len, instantiate, p,
+ (void *)(unsigned long)fd))
+ goto out_fd_loop;
+ rcu_read_lock();
}
+ rcu_read_unlock();
+out_fd_loop:
+ put_files_struct(files);
out:
put_task_struct(p);
-out_no_task:
- return retval;
+ return 0;
}
-static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_readfd(struct file *file, struct dir_context *ctx)
{
- return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
+ return proc_readfd_common(file, ctx, proc_fd_instantiate);
}
const struct file_operations proc_fd_operations = {
.read = generic_read_dir,
- .readdir = proc_readfd,
+ .iterate = proc_readfd,
.llseek = default_llseek,
};
@@ -351,9 +335,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
}
-static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
{
- return proc_readfd_common(filp, dirent, filldir,
+ return proc_readfd_common(file, ctx,
proc_fdinfo_instantiate);
}
@@ -364,6 +348,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
const struct file_operations proc_fdinfo_operations = {
.read = generic_read_dir,
- .readdir = proc_readfdinfo,
+ .iterate = proc_readfdinfo,
.llseek = default_llseek,
};
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a2596afffae6..94441a407337 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -233,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
* value of the readdir() call, as long as it's non-negative
* for success..
*/
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
- filldir_t filldir)
+int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
+ struct dir_context *ctx)
{
- unsigned int ino;
int i;
- struct inode *inode = file_inode(filp);
- int ret = 0;
-
- ino = inode->i_ino;
- i = filp->f_pos;
- switch (i) {
- case 0:
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- case 1:
- if (filldir(dirent, "..", 2, i,
- parent_ino(filp->f_path.dentry),
- DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- default:
- spin_lock(&proc_subdir_lock);
- de = de->subdir;
- i -= 2;
- for (;;) {
- if (!de) {
- ret = 1;
- spin_unlock(&proc_subdir_lock);
- goto out;
- }
- if (!i)
- break;
- de = de->next;
- i--;
- }
- do {
- struct proc_dir_entry *next;
-
- /* filldir passes info to user space */
- pde_get(de);
- spin_unlock(&proc_subdir_lock);
- if (filldir(dirent, de->name, de->namelen, filp->f_pos,
- de->low_ino, de->mode >> 12) < 0) {
- pde_put(de);
- goto out;
- }
- spin_lock(&proc_subdir_lock);
- filp->f_pos++;
- next = de->next;
- pde_put(de);
- de = next;
- } while (de);
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ spin_lock(&proc_subdir_lock);
+ de = de->subdir;
+ i = ctx->pos - 2;
+ for (;;) {
+ if (!de) {
spin_unlock(&proc_subdir_lock);
+ return 0;
+ }
+ if (!i)
+ break;
+ de = de->next;
+ i--;
}
- ret = 1;
-out:
- return ret;
+
+ do {
+ struct proc_dir_entry *next;
+ pde_get(de);
+ spin_unlock(&proc_subdir_lock);
+ if (!dir_emit(ctx, de->name, de->namelen,
+ de->low_ino, de->mode >> 12)) {
+ pde_put(de);
+ return 0;
+ }
+ spin_lock(&proc_subdir_lock);
+ ctx->pos++;
+ next = de->next;
+ pde_put(de);
+ de = next;
+ } while (de);
+ spin_unlock(&proc_subdir_lock);
+ return 0;
}
-int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int proc_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
- return proc_readdir_de(PDE(inode), filp, dirent, filldir);
+ return proc_readdir_de(PDE(inode), file, ctx);
}
/*
@@ -313,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
static const struct file_operations proc_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = proc_readdir,
+ .iterate = proc_readdir,
};
/*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d600fb098b6a..4eae2e149f31 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -165,14 +165,14 @@ extern int proc_setattr(struct dentry *, struct iattr *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
extern int pid_revalidate(struct dentry *, unsigned int);
extern int pid_delete_dentry(const struct dentry *);
-extern int proc_pid_readdir(struct file *, void *, filldir_t);
+extern int proc_pid_readdir(struct file *, struct dir_context *);
extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
struct task_struct *, const void *);
-extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int,
+extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
instantiate_t, struct task_struct *, const void *);
/*
@@ -183,8 +183,8 @@ extern spinlock_t proc_subdir_lock;
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
struct dentry *);
-extern int proc_readdir(struct file *, void *, filldir_t);
-extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t);
+extern int proc_readdir(struct file *, struct dir_context *);
+extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
{
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bd4b5a740ff1..bdfabdaefdce 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait;
static int kmsg_open(struct inode * inode, struct file * file)
{
- return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
+ return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
}
static int kmsg_release(struct inode * inode, struct file * file)
{
- (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
+ (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC);
return 0;
}
@@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
if ((file->f_flags & O_NONBLOCK) &&
- !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+ !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
return -EAGAIN;
- return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
+ return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC);
}
static unsigned int kmsg_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &log_wait, wait);
- if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+ if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
return POLLIN | POLLRDNORM;
return 0;
}
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 54bdc6701e9f..f6abbbbfad8a 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -213,74 +213,36 @@ out:
return error;
}
-static int proc_ns_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir, struct task_struct *task,
- const struct proc_ns_operations *ops)
+static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_fill_cache(filp, dirent, filldir,
- ops->name, strlen(ops->name),
- proc_ns_instantiate, task, ops);
-}
-
-static int proc_ns_dir_readdir(struct file *filp, void *dirent,
- filldir_t filldir)
-{
- int i;
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
const struct proc_ns_operations **entry, **last;
- ino_t ino;
- int ret;
- ret = -ENOENT;
if (!task)
- goto out_no_task;
+ return -ENOENT;
- ret = 0;
- i = filp->f_pos;
- switch (i) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- default:
- i -= 2;
- if (i >= ARRAY_SIZE(ns_entries)) {
- ret = 1;
- goto out;
- }
- entry = ns_entries + i;
- last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
- while (entry <= last) {
- if (proc_ns_fill_cache(filp, dirent, filldir,
- task, *entry) < 0)
- goto out;
- filp->f_pos++;
- entry++;
- }
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
+ goto out;
+ entry = ns_entries + (ctx->pos - 2);
+ last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
+ while (entry <= last) {
+ const struct proc_ns_operations *ops = *entry;
+ if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
+ proc_ns_instantiate, task, ops))
+ break;
+ ctx->pos++;
+ entry++;
}
-
- ret = 1;
out:
put_task_struct(task);
-out_no_task:
- return ret;
+ return 0;
}
const struct file_operations proc_ns_dir_operations = {
.read = generic_read_dir,
- .readdir = proc_ns_dir_readdir,
+ .iterate = proc_ns_dir_readdir,
};
static struct dentry *proc_ns_dir_lookup(struct inode *dir,
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 986e83220d56..4677bb7dc7c2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -160,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = {
.getattr = proc_tgid_net_getattr,
};
-static int proc_tgid_net_readdir(struct file *filp, void *dirent,
- filldir_t filldir)
+static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
{
int ret;
struct net *net;
ret = -EINVAL;
- net = get_proc_task_net(file_inode(filp));
+ net = get_proc_task_net(file_inode(file));
if (net != NULL) {
- ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
+ ret = proc_readdir_de(net->proc_net, file, ctx);
put_net(net);
}
return ret;
@@ -178,7 +177,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
const struct file_operations proc_net_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = proc_tgid_net_readdir,
+ .iterate = proc_tgid_net_readdir,
};
static __net_init int proc_net_ns_init(struct net *net)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ac05f33a0dde..f3a570e7c257 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -573,12 +573,12 @@ out:
return ret;
}
-static int proc_sys_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir,
+static bool proc_sys_fill_cache(struct file *file,
+ struct dir_context *ctx,
struct ctl_table_header *head,
struct ctl_table *table)
{
- struct dentry *child, *dir = filp->f_path.dentry;
+ struct dentry *child, *dir = file->f_path.dentry;
struct inode *inode;
struct qstr qname;
ino_t ino = 0;
@@ -595,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
inode = proc_sys_make_inode(dir->d_sb, head, table);
if (!inode) {
dput(child);
- return -ENOMEM;
+ return false;
} else {
d_set_d_op(child, &proc_sys_dentry_operations);
d_add(child, inode);
}
} else {
- return -ENOMEM;
+ return false;
}
}
inode = child->d_inode;
ino = inode->i_ino;
type = inode->i_mode >> 12;
dput(child);
- return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
+ return dir_emit(ctx, qname.name, qname.len, ino, type);
}
-static int proc_sys_link_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir,
+static bool proc_sys_link_fill_cache(struct file *file,
+ struct dir_context *ctx,
struct ctl_table_header *head,
struct ctl_table *table)
{
- int err, ret = 0;
+ bool ret = true;
head = sysctl_head_grab(head);
if (S_ISLNK(table->mode)) {
/* It is not an error if we can not follow the link ignore it */
- err = sysctl_follow_link(&head, &table, current->nsproxy);
+ int err = sysctl_follow_link(&head, &table, current->nsproxy);
if (err)
goto out;
}
- ret = proc_sys_fill_cache(filp, dirent, filldir, head, table);
+ ret = proc_sys_fill_cache(file, ctx, head, table);
out:
sysctl_head_finish(head);
return ret;
@@ -634,67 +634,50 @@ out:
static int scan(struct ctl_table_header *head, ctl_table *table,
unsigned long *pos, struct file *file,
- void *dirent, filldir_t filldir)
+ struct dir_context *ctx)
{
- int res;
+ bool res;
- if ((*pos)++ < file->f_pos)
- return 0;
+ if ((*pos)++ < ctx->pos)
+ return true;
if (unlikely(S_ISLNK(table->mode)))
- res = proc_sys_link_fill_cache(file, dirent, filldir, head, table);
+ res = proc_sys_link_fill_cache(file, ctx, head, table);
else
- res = proc_sys_fill_cache(file, dirent, filldir, head, table);
+ res = proc_sys_fill_cache(file, ctx, head, table);
- if (res == 0)
- file->f_pos = *pos;
+ if (res)
+ ctx->pos = *pos;
return res;
}
-static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table_header *head = grab_header(file_inode(file));
struct ctl_table_header *h = NULL;
struct ctl_table *entry;
struct ctl_dir *ctl_dir;
unsigned long pos;
- int ret = -EINVAL;
if (IS_ERR(head))
return PTR_ERR(head);
ctl_dir = container_of(head, struct ctl_dir, header);
- ret = 0;
- /* Avoid a switch here: arm builds fail with missing __cmpdi2 */
- if (filp->f_pos == 0) {
- if (filldir(dirent, ".", 1, filp->f_pos,
- inode->i_ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- }
- if (filp->f_pos == 1) {
- if (filldir(dirent, "..", 2, filp->f_pos,
- parent_ino(dentry), DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
pos = 2;
for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
- ret = scan(h, entry, &pos, filp, dirent, filldir);
- if (ret) {
+ if (!scan(h, entry, &pos, file, ctx)) {
sysctl_head_finish(h);
break;
}
}
- ret = 1;
-out:
sysctl_head_finish(head);
- return ret;
+ return 0;
}
static int proc_sys_permission(struct inode *inode, int mask)
@@ -769,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = {
static const struct file_operations proc_sys_dir_file_operations = {
.read = generic_read_dir,
- .readdir = proc_sys_readdir,
+ .iterate = proc_sys_readdir,
.llseek = generic_file_llseek,
};
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 41a6ea93f486..229e366598da 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -202,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
return proc_pid_lookup(dir, dentry, flags);
}
-static int proc_root_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_root_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned int nr = filp->f_pos;
- int ret;
-
- if (nr < FIRST_PROCESS_ENTRY) {
- int error = proc_readdir(filp, dirent, filldir);
- if (error <= 0)
- return error;
- filp->f_pos = FIRST_PROCESS_ENTRY;
+ if (ctx->pos < FIRST_PROCESS_ENTRY) {
+ proc_readdir(file, ctx);
+ ctx->pos = FIRST_PROCESS_ENTRY;
}
- ret = proc_pid_readdir(filp, dirent, filldir);
- return ret;
+ return proc_pid_readdir(file, ctx);
}
/*
@@ -226,7 +219,7 @@ static int proc_root_readdir(struct file * filp,
*/
static const struct file_operations proc_root_operations = {
.read = generic_read_dir,
- .readdir = proc_root_readdir,
+ .iterate = proc_root_readdir,
.llseek = default_llseek,
};
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 28ce014b3cef..b218f965817b 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -14,9 +14,9 @@
#include <linux/buffer_head.h>
#include "qnx4.h"
-static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int qnx4_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
unsigned int offset;
struct buffer_head *bh;
struct qnx4_inode_entry *de;
@@ -26,48 +26,44 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
int size;
QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
- QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
+ QNX4DEBUG((KERN_INFO "pos = %ld\n", (long) ctx->pos));
- while (filp->f_pos < inode->i_size) {
- blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
+ while (ctx->pos < inode->i_size) {
+ blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS);
bh = sb_bread(inode->i_sb, blknum);
- if(bh==NULL) {
+ if (bh == NULL) {
printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum);
- break;
+ return 0;
}
- ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
- while (ix < QNX4_INODES_PER_BLOCK) {
+ ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
+ for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
offset = ix * QNX4_DIR_ENTRY_SIZE;
de = (struct qnx4_inode_entry *) (bh->b_data + offset);
- size = strlen(de->di_fname);
- if (size) {
- if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX )
- size = QNX4_SHORT_NAME_MAX;
- else if ( size > QNX4_NAME_MAX )
- size = QNX4_NAME_MAX;
-
- if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
- QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
- if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
- ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
- else {
- le = (struct qnx4_link_info*)de;
- ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
- QNX4_INODES_PER_BLOCK +
- le->dl_inode_ndx;
- }
- if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) {
- brelse(bh);
- goto out;
- }
- }
+ if (!de->di_fname[0])
+ continue;
+ if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
+ continue;
+ if (!(de->di_status & QNX4_FILE_LINK))
+ size = QNX4_SHORT_NAME_MAX;
+ else
+ size = QNX4_NAME_MAX;
+ size = strnlen(de->di_fname, size);
+ QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
+ if (!(de->di_status & QNX4_FILE_LINK))
+ ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
+ else {
+ le = (struct qnx4_link_info*)de;
+ ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
+ QNX4_INODES_PER_BLOCK +
+ le->dl_inode_ndx;
+ }
+ if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
+ brelse(bh);
+ return 0;
}
- ix++;
- filp->f_pos += QNX4_DIR_ENTRY_SIZE;
}
brelse(bh);
}
-out:
return 0;
}
@@ -75,7 +71,7 @@ const struct file_operations qnx4_dir_operations =
{
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = qnx4_readdir,
+ .iterate = qnx4_readdir,
.fsync = generic_file_fsync,
};
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 8798d065e400..15b7d92ed60d 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -65,8 +65,8 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
static int qnx6_dir_longfilename(struct inode *inode,
struct qnx6_long_dir_entry *de,
- void *dirent, loff_t pos,
- unsigned de_inode, filldir_t filldir)
+ struct dir_context *ctx,
+ unsigned de_inode)
{
struct qnx6_long_filename *lf;
struct super_block *s = inode->i_sb;
@@ -104,8 +104,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n",
lf_size, lf->lf_fname, de_inode));
- if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode,
- DT_UNKNOWN) < 0) {
+ if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
qnx6_put_page(page);
return 0;
}
@@ -115,18 +114,19 @@ static int qnx6_dir_longfilename(struct inode *inode,
return 1;
}
-static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int qnx6_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *s = inode->i_sb;
struct qnx6_sb_info *sbi = QNX6_SB(s);
- loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1);
+ loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
unsigned long npages = dir_pages(inode);
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
bool done = false;
- if (filp->f_pos >= inode->i_size)
+ ctx->pos = pos;
+ if (ctx->pos >= inode->i_size)
return 0;
for ( ; !done && n < npages; n++, start = 0) {
@@ -137,11 +137,11 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (IS_ERR(page)) {
printk(KERN_ERR "qnx6_readdir: read failed\n");
- filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT;
+ ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
return PTR_ERR(page);
}
de = ((struct qnx6_dir_entry *)page_address(page)) + start;
- for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) {
+ for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
int size = de->de_size;
u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
@@ -154,8 +154,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
structure / block */
if (!qnx6_dir_longfilename(inode,
(struct qnx6_long_dir_entry *)de,
- dirent, pos, no_inode,
- filldir)) {
+ ctx, no_inode)) {
done = true;
break;
}
@@ -163,9 +162,8 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s"
" inode:%u\n", size, de->de_fname,
no_inode));
- if (filldir(dirent, de->de_fname, size,
- pos, no_inode, DT_UNKNOWN)
- < 0) {
+ if (!dir_emit(ctx, de->de_fname, size,
+ no_inode, DT_UNKNOWN)) {
done = true;
break;
}
@@ -173,7 +171,6 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
qnx6_put_page(page);
}
- filp->f_pos = pos;
return 0;
}
@@ -282,7 +279,7 @@ found:
const struct file_operations qnx6_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = qnx6_readdir,
+ .iterate = qnx6_readdir,
.fsync = generic_file_fsync,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index 03430008704e..2cefa417be34 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1064,6 +1064,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
struct fd in, out;
struct inode *in_inode, *out_inode;
loff_t pos;
+ loff_t out_pos;
ssize_t retval;
int fl;
@@ -1077,12 +1078,14 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (!(in.file->f_mode & FMODE_READ))
goto fput_in;
retval = -ESPIPE;
- if (!ppos)
- ppos = &in.file->f_pos;
- else
+ if (!ppos) {
+ pos = in.file->f_pos;
+ } else {
+ pos = *ppos;
if (!(in.file->f_mode & FMODE_PREAD))
goto fput_in;
- retval = rw_verify_area(READ, in.file, ppos, count);
+ }
+ retval = rw_verify_area(READ, in.file, &pos, count);
if (retval < 0)
goto fput_in;
count = retval;
@@ -1099,7 +1102,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
retval = -EINVAL;
in_inode = file_inode(in.file);
out_inode = file_inode(out.file);
- retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
+ out_pos = out.file->f_pos;
+ retval = rw_verify_area(WRITE, out.file, &out_pos, count);
if (retval < 0)
goto fput_out;
count = retval;
@@ -1107,7 +1111,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (!max)
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
- pos = *ppos;
if (unlikely(pos + count > max)) {
retval = -EOVERFLOW;
if (pos >= max)
@@ -1126,18 +1129,23 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (in.file->f_flags & O_NONBLOCK)
fl = SPLICE_F_NONBLOCK;
#endif
- retval = do_splice_direct(in.file, ppos, out.file, count, fl);
+ retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
if (retval > 0) {
add_rchar(current, retval);
add_wchar(current, retval);
fsnotify_access(in.file);
fsnotify_modify(out.file);
+ out.file->f_pos = out_pos;
+ if (ppos)
+ *ppos = pos;
+ else
+ in.file->f_pos = pos;
}
inc_syscr(current);
inc_syscw(current);
- if (*ppos > max)
+ if (pos > max)
retval = -EOVERFLOW;
fput_out:
diff --git a/fs/readdir.c b/fs/readdir.c
index fee38e04fae4..93d71e574310 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,11 +20,11 @@
#include <asm/uaccess.h>
-int vfs_readdir(struct file *file, filldir_t filler, void *buf)
+int iterate_dir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
int res = -ENOTDIR;
- if (!file->f_op || !file->f_op->readdir)
+ if (!file->f_op || !file->f_op->iterate)
goto out;
res = security_file_permission(file, MAY_READ);
@@ -37,15 +37,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
res = -ENOENT;
if (!IS_DEADDIR(inode)) {
- res = file->f_op->readdir(file, buf, filler);
+ ctx->pos = file->f_pos;
+ res = file->f_op->iterate(file, ctx);
+ file->f_pos = ctx->pos;
file_accessed(file);
}
mutex_unlock(&inode->i_mutex);
out:
return res;
}
-
-EXPORT_SYMBOL(vfs_readdir);
+EXPORT_SYMBOL(iterate_dir);
/*
* Traditional linux readdir() handling..
@@ -66,6 +67,7 @@ struct old_linux_dirent {
};
struct readdir_callback {
+ struct dir_context ctx;
struct old_linux_dirent __user * dirent;
int result;
};
@@ -73,7 +75,7 @@ struct readdir_callback {
static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
u64 ino, unsigned int d_type)
{
- struct readdir_callback * buf = (struct readdir_callback *) __buf;
+ struct readdir_callback *buf = (struct readdir_callback *) __buf;
struct old_linux_dirent __user * dirent;
unsigned long d_ino;
@@ -107,15 +109,15 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
{
int error;
struct fd f = fdget(fd);
- struct readdir_callback buf;
+ struct readdir_callback buf = {
+ .ctx.actor = fillonedir,
+ .dirent = dirent
+ };
if (!f.file)
return -EBADF;
- buf.result = 0;
- buf.dirent = dirent;
-
- error = vfs_readdir(f.file, fillonedir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (buf.result)
error = buf.result;
@@ -137,6 +139,7 @@ struct linux_dirent {
};
struct getdents_callback {
+ struct dir_context ctx;
struct linux_dirent __user * current_dir;
struct linux_dirent __user * previous;
int count;
@@ -191,7 +194,11 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
{
struct fd f;
struct linux_dirent __user * lastdirent;
- struct getdents_callback buf;
+ struct getdents_callback buf = {
+ .ctx.actor = filldir,
+ .count = count,
+ .current_dir = dirent
+ };
int error;
if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -201,17 +208,12 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (!f.file)
return -EBADF;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
-
- error = vfs_readdir(f.file, filldir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- if (put_user(f.file->f_pos, &lastdirent->d_off))
+ if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
@@ -221,6 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
}
struct getdents_callback64 {
+ struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
struct linux_dirent64 __user * previous;
int count;
@@ -271,7 +274,11 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
{
struct fd f;
struct linux_dirent64 __user * lastdirent;
- struct getdents_callback64 buf;
+ struct getdents_callback64 buf = {
+ .ctx.actor = filldir64,
+ .count = count,
+ .current_dir = dirent
+ };
int error;
if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -281,17 +288,12 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (!f.file)
return -EBADF;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
-
- error = vfs_readdir(f.file, filldir64, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- typeof(lastdirent->d_off) d_off = f.file->f_pos;
+ typeof(lastdirent->d_off) d_off = buf.ctx.pos;
if (__put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 66c53b642a88..03e4ca5624d6 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -13,14 +13,14 @@
extern const struct reiserfs_key MIN_KEY;
-static int reiserfs_readdir(struct file *, void *, filldir_t);
+static int reiserfs_readdir(struct file *, struct dir_context *);
static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
int datasync);
const struct file_operations reiserfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = reiserfs_readdir,
+ .iterate = reiserfs_readdir,
.fsync = reiserfs_dir_fsync,
.unlocked_ioctl = reiserfs_ioctl,
#ifdef CONFIG_COMPAT
@@ -50,18 +50,15 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
#define store_ih(where,what) copy_item_head (where, what)
-static inline bool is_privroot_deh(struct dentry *dir,
- struct reiserfs_de_head *deh)
+static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
{
- struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
- return (dir == dir->d_parent && privroot->d_inode &&
+ struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
+ return (privroot->d_inode &&
deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
}
-int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
- filldir_t filldir, loff_t *pos)
+int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
{
- struct inode *inode = dentry->d_inode;
struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */
INITIALIZE_PATH(path_to_entry);
struct buffer_head *bh;
@@ -81,7 +78,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
/* form key for search the next directory entry using f_pos field of
file structure */
- make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
+ make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
next_pos = cpu_key_k_offset(&pos_key);
path_to_entry.reada = PATH_READA;
@@ -126,7 +123,6 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
entry_num++, deh++) {
int d_reclen;
char *d_name;
- off_t d_off;
ino_t d_ino;
if (!de_visible(deh))
@@ -155,11 +151,10 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
}
/* Ignore the .reiserfs_priv entry */
- if (is_privroot_deh(dentry, deh))
+ if (is_privroot_deh(inode, deh))
continue;
- d_off = deh_offset(deh);
- *pos = d_off;
+ ctx->pos = deh_offset(deh);
d_ino = deh_objectid(deh);
if (d_reclen <= 32) {
local_buf = small_buf;
@@ -187,9 +182,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
* the write lock here for other waiters
*/
reiserfs_write_unlock(inode->i_sb);
- if (filldir
- (dirent, local_buf, d_reclen, d_off, d_ino,
- DT_UNKNOWN) < 0) {
+ if (!dir_emit
+ (ctx, local_buf, d_reclen, d_ino,
+ DT_UNKNOWN)) {
reiserfs_write_lock(inode->i_sb);
if (local_buf != small_buf) {
kfree(local_buf);
@@ -204,6 +199,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
next_pos = deh_offset(deh) + 1;
if (item_moved(&tmp_ih, &path_to_entry)) {
+ set_cpu_key_k_offset(&pos_key,
+ next_pos);
goto research;
}
} /* for */
@@ -235,7 +232,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
} /* while */
end:
- *pos = next_pos;
+ ctx->pos = next_pos;
pathrelse(&path_to_entry);
reiserfs_check_path(&path_to_entry);
out:
@@ -243,10 +240,9 @@ out:
return ret;
}
-static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = file->f_path.dentry;
- return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos);
+ return reiserfs_readdir_inode(file_inode(file), ctx);
}
/* compose directory item containing "." and ".." entries (entries are
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 77d6d47abc83..0048cc16a6a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1811,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
- if (insert_inode_locked4(inode, args.objectid,
- reiserfs_find_actor, &args) < 0) {
+
+ reiserfs_write_unlock(inode->i_sb);
+ err = insert_inode_locked4(inode, args.objectid,
+ reiserfs_find_actor, &args);
+ reiserfs_write_lock(inode->i_sb);
+ if (err) {
err = -EINVAL;
goto out_bad_inode;
}
+
if (old_format_only(sb))
/* not a perfect generation count, as object ids can be reused, but
** this is as good as reiserfs can do right now.
@@ -2970,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
}
/* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
struct inode *inode = page->mapping->host;
unsigned int curr_off = 0;
+ unsigned int stop = offset + length;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int ret = 1;
BUG_ON(!PageLocked(page));
- if (offset == 0)
+ if (!partial_page)
ClearPageChecked(page);
if (!page_has_buffers(page))
@@ -2991,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ goto out;
+
/*
* is this block fully invalidated?
*/
@@ -3009,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (!offset && ret) {
+ if (!partial_page && ret) {
ret = try_to_release_page(page, 0);
/* maybe should BUG_ON(!ret); - neilb */
}
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 157e474ab303..3df5ce6c724d 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2709,7 +2709,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations;
extern const struct inode_operations reiserfs_symlink_inode_operations;
extern const struct inode_operations reiserfs_special_inode_operations;
extern const struct file_operations reiserfs_dir_operations;
-int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *);
+int reiserfs_readdir_inode(struct inode *, struct dir_context *);
/* tail_conversion.c */
int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 4cce1d9552fb..c69cdd749f09 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -171,6 +171,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
* modifying extended attributes. This includes operations such as permissions
* or ownership changes, object deletions, etc. */
struct reiserfs_dentry_buf {
+ struct dir_context ctx;
struct dentry *xadir;
int count;
struct dentry *dentries[8];
@@ -223,9 +224,8 @@ static int reiserfs_for_each_xattr(struct inode *inode,
{
struct dentry *dir;
int i, err = 0;
- loff_t pos = 0;
struct reiserfs_dentry_buf buf = {
- .count = 0,
+ .ctx.actor = fill_with_dentries,
};
/* Skip out, an xattr has no xattrs associated with it */
@@ -249,29 +249,27 @@ static int reiserfs_for_each_xattr(struct inode *inode,
reiserfs_write_lock(inode->i_sb);
buf.xadir = dir;
- err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
- while ((err == 0 || err == -ENOSPC) && buf.count) {
- err = 0;
-
- for (i = 0; i < buf.count && buf.dentries[i]; i++) {
- int lerr = 0;
+ while (1) {
+ err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
+ if (err)
+ break;
+ if (!buf.count)
+ break;
+ for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
struct dentry *dentry = buf.dentries[i];
- if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode))
- lerr = action(dentry, data);
+ if (!S_ISDIR(dentry->d_inode->i_mode))
+ err = action(dentry, data);
dput(dentry);
buf.dentries[i] = NULL;
- err = lerr ?: err;
}
+ if (err)
+ break;
buf.count = 0;
- if (!err)
- err = reiserfs_readdir_dentry(dir, &buf,
- fill_with_dentries, &pos);
}
mutex_unlock(&dir->d_inode->i_mutex);
- /* Clean up after a failed readdir */
cleanup_dentry_buf(&buf);
if (!err) {
@@ -318,7 +316,19 @@ static int delete_one_xattr(struct dentry *dentry, void *data)
static int chown_one_xattr(struct dentry *dentry, void *data)
{
struct iattr *attrs = data;
- return reiserfs_setattr(dentry, attrs);
+ int ia_valid = attrs->ia_valid;
+ int err;
+
+ /*
+ * We only want the ownership bits. Otherwise, we'll do
+ * things like change a directory to a regular file if
+ * ATTR_MODE is set.
+ */
+ attrs->ia_valid &= (ATTR_UID|ATTR_GID);
+ err = reiserfs_setattr(dentry, attrs);
+ attrs->ia_valid = ia_valid;
+
+ return err;
}
/* No i_mutex, but the inode is unconnected. */
@@ -788,6 +798,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
}
struct listxattr_buf {
+ struct dir_context ctx;
size_t size;
size_t pos;
char *buf;
@@ -833,8 +844,8 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
{
struct dentry *dir;
int err = 0;
- loff_t pos = 0;
struct listxattr_buf buf = {
+ .ctx.actor = listxattr_filler,
.dentry = dentry,
.buf = buffer,
.size = buffer ? size : 0,
@@ -856,7 +867,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
}
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
- err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos);
+ err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
mutex_unlock(&dir->d_inode->i_mutex);
if (!err)
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d7c01ef64eda..6c8767fdfc6a 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -443,6 +443,9 @@ int reiserfs_acl_chmod(struct inode *inode)
int depth;
int error;
+ if (IS_PRIVATE(inode))
+ return 0;
+
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 15cbc41ee365..ff1d3d42e72a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -145,19 +145,18 @@ static const struct address_space_operations romfs_aops = {
/*
* read the entries from a directory
*/
-static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int romfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *i = file_inode(filp);
+ struct inode *i = file_inode(file);
struct romfs_inode ri;
unsigned long offset, maxoff;
int j, ino, nextfh;
- int stored = 0;
char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
int ret;
maxoff = romfs_maxsize(i->i_sb);
- offset = filp->f_pos;
+ offset = ctx->pos;
if (!offset) {
offset = i->i_ino & ROMFH_MASK;
ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -170,10 +169,10 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
for (;;) {
if (!offset || offset >= maxoff) {
offset = maxoff;
- filp->f_pos = offset;
+ ctx->pos = offset;
goto out;
}
- filp->f_pos = offset;
+ ctx->pos = offset;
/* Fetch inode info */
ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -194,16 +193,14 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
nextfh = be32_to_cpu(ri.next);
if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
ino = be32_to_cpu(ri.spec);
- if (filldir(dirent, fsname, j, offset, ino,
- romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
+ if (!dir_emit(ctx, fsname, j, ino,
+ romfs_dtype_table[nextfh & ROMFH_TYPE]))
goto out;
- stored++;
offset = nextfh & ROMFH_MASK;
}
-
out:
- return stored;
+ return 0;
}
/*
@@ -281,7 +278,7 @@ error:
static const struct file_operations romfs_dir_operations = {
.read = generic_read_dir,
- .readdir = romfs_readdir,
+ .iterate = romfs_readdir,
.llseek = default_llseek,
};
diff --git a/fs/splice.c b/fs/splice.c
index e6b25598c8c4..d37431dd60a1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1274,7 +1274,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
{
struct file *file = sd->u.file;
- return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
+ return do_splice_from(pipe, file, sd->opos, sd->total_len,
sd->flags);
}
@@ -1283,6 +1283,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
* @in: file to splice from
* @ppos: input file offset
* @out: file to splice to
+ * @opos: output file offset
* @len: number of bytes to splice
* @flags: splice modifier flags
*
@@ -1294,7 +1295,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
*
*/
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
- size_t len, unsigned int flags)
+ loff_t *opos, size_t len, unsigned int flags)
{
struct splice_desc sd = {
.len = len,
@@ -1302,6 +1303,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
.flags = flags,
.pos = *ppos,
.u.file = out,
+ .opos = opos,
};
long ret;
@@ -1325,7 +1327,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
- loff_t offset, *off;
+ loff_t offset;
long ret;
ipipe = get_pipe_info(in);
@@ -1356,13 +1358,15 @@ static long do_splice(struct file *in, loff_t __user *off_in,
return -EINVAL;
if (copy_from_user(&offset, off_out, sizeof(loff_t)))
return -EFAULT;
- off = &offset;
- } else
- off = &out->f_pos;
+ } else {
+ offset = out->f_pos;
+ }
- ret = do_splice_from(ipipe, out, off, len, flags);
+ ret = do_splice_from(ipipe, out, &offset, len, flags);
- if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
+ if (!off_out)
+ out->f_pos = offset;
+ else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
ret = -EFAULT;
return ret;
@@ -1376,13 +1380,15 @@ static long do_splice(struct file *in, loff_t __user *off_in,
return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t)))
return -EFAULT;
- off = &offset;
- } else
- off = &in->f_pos;
+ } else {
+ offset = in->f_pos;
+ }
- ret = do_splice_to(in, off, opipe, len, flags);
+ ret = do_splice_to(in, &offset, opipe, len, flags);
- if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
+ if (!off_in)
+ in->f_pos = offset;
+ else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
ret = -EFAULT;
return ret;
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 57dc70ebbb19..f7f527bf8c10 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -100,7 +100,7 @@ static int get_dir_index_using_offset(struct super_block *sb,
}
-static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int squashfs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
@@ -127,11 +127,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
* It also means that the external f_pos is offset by 3 from the
* on-disk directory f_pos.
*/
- while (file->f_pos < 3) {
+ while (ctx->pos < 3) {
char *name;
int i_ino;
- if (file->f_pos == 0) {
+ if (ctx->pos == 0) {
name = ".";
size = 1;
i_ino = inode->i_ino;
@@ -141,24 +141,18 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
i_ino = squashfs_i(inode)->parent;
}
- TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
- dirent, name, size, file->f_pos, i_ino,
- squashfs_filetype_table[1]);
-
- if (filldir(dirent, name, size, file->f_pos, i_ino,
- squashfs_filetype_table[1]) < 0) {
- TRACE("Filldir returned less than 0\n");
+ if (!dir_emit(ctx, name, size, i_ino,
+ squashfs_filetype_table[1]))
goto finish;
- }
- file->f_pos += size;
+ ctx->pos += size;
}
length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
squashfs_i(inode)->dir_idx_start,
squashfs_i(inode)->dir_idx_offset,
squashfs_i(inode)->dir_idx_cnt,
- file->f_pos);
+ ctx->pos);
while (length < i_size_read(inode)) {
/*
@@ -198,7 +192,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
length += sizeof(*dire) + size;
- if (file->f_pos >= length)
+ if (ctx->pos >= length)
continue;
dire->name[size] = '\0';
@@ -206,22 +200,12 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
((short) le16_to_cpu(dire->inode_number));
type = le16_to_cpu(dire->type);
- TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
- "\n", dirent, dire->name, size,
- file->f_pos,
- le32_to_cpu(dirh.start_block),
- le16_to_cpu(dire->offset),
- inode_number,
- squashfs_filetype_table[type]);
-
- if (filldir(dirent, dire->name, size, file->f_pos,
+ if (!dir_emit(ctx, dire->name, size,
inode_number,
- squashfs_filetype_table[type]) < 0) {
- TRACE("Filldir returned less than 0\n");
+ squashfs_filetype_table[type]))
goto finish;
- }
- file->f_pos = length;
+ ctx->pos = length;
}
}
@@ -238,6 +222,6 @@ failed_read:
const struct file_operations squashfs_dir_ops = {
.read = generic_read_dir,
- .readdir = squashfs_readdir,
+ .iterate = squashfs_readdir,
.llseek = default_llseek,
};
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e8e0e71b29d5..e068e744dbdd 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
}
/**
- * sysfs_link_subling - link sysfs_dirent into sibling rbtree
+ * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
* @sd: sysfs_dirent of interest
*
* Link @sd into its sibling rbtree which starts from
@@ -998,68 +998,38 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
return pos;
}
-static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct sysfs_dirent * parent_sd = dentry->d_fsdata;
- struct sysfs_dirent *pos = filp->private_data;
+ struct sysfs_dirent *pos = file->private_data;
enum kobj_ns_type type;
const void *ns;
- ino_t ino;
- loff_t off;
type = sysfs_ns_type(parent_sd);
ns = sysfs_info(dentry->d_sb)->ns[type];
- if (filp->f_pos == 0) {
- ino = parent_sd->s_ino;
- if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
- filp->f_pos++;
- else
- return 0;
- }
- if (filp->f_pos == 1) {
- if (parent_sd->s_parent)
- ino = parent_sd->s_parent->s_ino;
- else
- ino = parent_sd->s_ino;
- if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
- filp->f_pos++;
- else
- return 0;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
mutex_lock(&sysfs_mutex);
- off = filp->f_pos;
- for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+ for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
pos;
- pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
- const char * name;
- unsigned int type;
- int len, ret;
-
- name = pos->s_name;
- len = strlen(name);
- ino = pos->s_ino;
- type = dt_type(pos);
- off = filp->f_pos = pos->s_hash;
- filp->private_data = sysfs_get(pos);
+ pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
+ const char *name = pos->s_name;
+ unsigned int type = dt_type(pos);
+ int len = strlen(name);
+ ino_t ino = pos->s_ino;
+ ctx->pos = pos->s_hash;
+ file->private_data = sysfs_get(pos);
mutex_unlock(&sysfs_mutex);
- ret = filldir(dirent, name, len, off, ino, type);
+ if (!dir_emit(ctx, name, len, ino, type))
+ return 0;
mutex_lock(&sysfs_mutex);
- if (ret < 0)
- break;
}
mutex_unlock(&sysfs_mutex);
-
- /* don't reference last entry if its refcount is dropped */
- if (!pos) {
- filp->private_data = NULL;
-
- /* EOF and not changed as 0 or 1 in read/write path */
- if (off == filp->f_pos && off > 1)
- filp->f_pos = INT_MAX;
- }
+ file->private_data = NULL;
+ ctx->pos = INT_MAX;
return 0;
}
@@ -1077,7 +1047,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations sysfs_dir_operations = {
.read = generic_read_dir,
- .readdir = sysfs_readdir,
+ .iterate = sysfs_readdir,
.release = sysfs_dir_release,
.llseek = sysfs_dir_llseek,
};
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 602f56db0442..d2bb7ed8fa74 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
- od = sd->s_attr.open;
- if (od) {
- atomic_inc(&od->event);
- wake_up_interruptible(&od->poll);
+ if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
+ od = sd->s_attr.open;
+ if (od) {
+ atomic_inc(&od->event);
+ wake_up_interruptible(&od->poll);
+ }
}
spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0ce3ccf7f401..3e2837a633ed 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -24,8 +24,6 @@
#include <linux/security.h>
#include "sysfs.h"
-extern struct super_block * sysfs_sb;
-
static const struct address_space_operations sysfs_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 3799e8dac3eb..d42291d08215 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -18,12 +18,12 @@
#include <linux/swap.h>
#include "sysv.h"
-static int sysv_readdir(struct file *, void *, filldir_t);
+static int sysv_readdir(struct file *, struct dir_context *);
const struct file_operations sysv_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = sysv_readdir,
+ .iterate = sysv_readdir,
.fsync = generic_file_fsync,
};
@@ -65,18 +65,21 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
return page;
}
-static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysv_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned long pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ unsigned long pos = ctx->pos;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
+ unsigned offset;
+ unsigned long n;
- pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
+ ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
if (pos >= inode->i_size)
- goto done;
+ return 0;
+
+ offset = pos & ~PAGE_CACHE_MASK;
+ n = pos >> PAGE_CACHE_SHIFT;
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -88,29 +91,21 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
kaddr = (char *)page_address(page);
de = (struct sysv_dir_entry *)(kaddr+offset);
limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
- for ( ;(char*)de <= limit; de++) {
+ for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
char *name = de->name;
- int over;
if (!de->inode)
continue;
- offset = (char *)de - kaddr;
-
- over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN),
- ((loff_t)n<<PAGE_CACHE_SHIFT) | offset,
+ if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
fs16_to_cpu(SYSV_SB(sb), de->inode),
- DT_UNKNOWN);
- if (over) {
+ DT_UNKNOWN)) {
dir_put_page(page);
- goto done;
+ return 0;
}
}
dir_put_page(page);
}
-
-done:
- filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
return 0;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index de08c92f2e23..6b4947f75af7 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -346,38 +346,46 @@ static unsigned int vfs_dent_type(uint8_t type)
* This means that UBIFS cannot support NFS which requires full
* 'seekdir()'/'telldir()' support.
*/
-static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int ubifs_readdir(struct file *file, struct dir_context *ctx)
{
- int err, over = 0;
+ int err;
struct qstr nm;
union ubifs_key key;
struct ubifs_dent_node *dent;
struct inode *dir = file_inode(file);
struct ubifs_info *c = dir->i_sb->s_fs_info;
- dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
+ dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
- if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
+ if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2)
/*
* The directory was seek'ed to a senseless position or there
* are no more entries.
*/
return 0;
- /* File positions 0 and 1 correspond to "." and ".." */
- if (file->f_pos == 0) {
- ubifs_assert(!file->private_data);
- over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
- if (over)
- return 0;
- file->f_pos = 1;
+ if (file->f_version == 0) {
+ /*
+ * The file was seek'ed, which means that @file->private_data
+ * is now invalid. This may also be just the first
+ * 'ubifs_readdir()' invocation, in which case
+ * @file->private_data is NULL, and the below code is
+ * basically a no-op.
+ */
+ kfree(file->private_data);
+ file->private_data = NULL;
}
- if (file->f_pos == 1) {
+ /*
+ * 'generic_file_llseek()' unconditionally sets @file->f_version to
+ * zero, and we use this for detecting whether the file was seek'ed.
+ */
+ file->f_version = 1;
+
+ /* File positions 0 and 1 correspond to "." and ".." */
+ if (ctx->pos < 2) {
ubifs_assert(!file->private_data);
- over = filldir(dirent, "..", 2, 1,
- parent_ino(file->f_path.dentry), DT_DIR);
- if (over)
+ if (!dir_emit_dots(file, ctx))
return 0;
/* Find the first entry in TNC and save it */
@@ -389,7 +397,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
goto out;
}
- file->f_pos = key_hash_flash(c, &dent->key);
+ ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
}
@@ -397,17 +405,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
if (!dent) {
/*
* The directory was seek'ed to and is now readdir'ed.
- * Find the entry corresponding to @file->f_pos or the
- * closest one.
+ * Find the entry corresponding to @ctx->pos or the closest one.
*/
- dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
+ dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
nm.name = NULL;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
goto out;
}
- file->f_pos = key_hash_flash(c, &dent->key);
+ ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
}
@@ -419,10 +426,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
ubifs_inode(dir)->creat_sqnum);
nm.len = le16_to_cpu(dent->nlen);
- over = filldir(dirent, dent->name, nm.len, file->f_pos,
+ if (!dir_emit(ctx, dent->name, nm.len,
le64_to_cpu(dent->inum),
- vfs_dent_type(dent->type));
- if (over)
+ vfs_dent_type(dent->type)))
return 0;
/* Switch to the next entry */
@@ -435,7 +441,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
}
kfree(file->private_data);
- file->f_pos = key_hash_flash(c, &dent->key);
+ ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
cond_resched();
}
@@ -448,18 +454,11 @@ out:
kfree(file->private_data);
file->private_data = NULL;
- file->f_pos = 2;
+ /* 2 is a special value indicating that there are no more direntries */
+ ctx->pos = 2;
return 0;
}
-/* If a directory is seeked, we have to free saved readdir() state */
-static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
-{
- kfree(file->private_data);
- file->private_data = NULL;
- return generic_file_llseek(file, offset, whence);
-}
-
/* Free saved readdir() state when the directory is closed */
static int ubifs_dir_release(struct inode *dir, struct file *file)
{
@@ -1177,10 +1176,10 @@ const struct inode_operations ubifs_dir_inode_operations = {
};
const struct file_operations ubifs_dir_operations = {
- .llseek = ubifs_dir_llseek,
+ .llseek = generic_file_llseek,
.release = ubifs_dir_release,
.read = generic_read_dir,
- .readdir = ubifs_readdir,
+ .iterate = ubifs_readdir,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 14374530784c..123c79b7261e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
}
-static void ubifs_invalidatepage(struct page *page, unsigned long offset)
+static void ubifs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
ubifs_assert(PagePrivate(page));
- if (offset)
+ if (offset || length < PAGE_CACHE_SIZE)
/* Partial page remains dirty */
return;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index b3e93f5e17c3..a012c51caffd 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -35,14 +35,16 @@
#include "udf_i.h"
#include "udf_sb.h"
-static int do_udf_readdir(struct inode *dir, struct file *filp,
- filldir_t filldir, void *dirent)
+
+static int udf_readdir(struct file *file, struct dir_context *ctx)
{
+ struct inode *dir = file_inode(file);
+ struct udf_inode_info *iinfo = UDF_I(dir);
struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
struct fileIdentDesc *fi = NULL;
struct fileIdentDesc cfi;
int block, iblock;
- loff_t nf_pos = (filp->f_pos - 1) << 2;
+ loff_t nf_pos;
int flen;
unsigned char *fname = NULL;
unsigned char *nameptr;
@@ -54,10 +56,14 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
uint32_t elen;
sector_t offset;
int i, num, ret = 0;
- unsigned int dt_type;
struct extent_position epos = { NULL, 0, {0, 0} };
- struct udf_inode_info *iinfo;
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
+ return 0;
+ ctx->pos = 1;
+ }
+ nf_pos = (ctx->pos - 1) << 2;
if (nf_pos >= size)
goto out;
@@ -71,7 +77,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
nf_pos = udf_ext0_offset(dir);
fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
- iinfo = UDF_I(dir);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
&epos, &eloc, &elen, &offset)
@@ -116,7 +121,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
}
while (nf_pos < size) {
- filp->f_pos = (nf_pos >> 2) + 1;
+ struct kernel_lb_addr tloc;
+
+ ctx->pos = (nf_pos >> 2) + 1;
fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
&elen, &offset);
@@ -155,24 +162,22 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
}
if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) {
- iblock = parent_ino(filp->f_path.dentry);
- flen = 2;
- memcpy(fname, "..", flen);
- dt_type = DT_DIR;
- } else {
- struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
-
- iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
- flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
- dt_type = DT_UNKNOWN;
+ if (!dir_emit_dotdot(file, ctx))
+ goto out;
+ continue;
}
- if (flen && filldir(dirent, fname, flen, filp->f_pos,
- iblock, dt_type) < 0)
+ flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
+ if (!flen)
+ continue;
+
+ tloc = lelb_to_cpu(cfi.icb.extLocation);
+ iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
+ if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
goto out;
} /* end while */
- filp->f_pos = (nf_pos >> 2) + 1;
+ ctx->pos = (nf_pos >> 2) + 1;
out:
if (fibh.sbh != fibh.ebh)
@@ -184,27 +189,11 @@ out:
return ret;
}
-static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
- struct inode *dir = file_inode(filp);
- int result;
-
- if (filp->f_pos == 0) {
- if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
- return 0;
- }
- filp->f_pos++;
- }
-
- result = do_udf_readdir(dir, filp, filldir, dirent);
- return result;
-}
-
/* readdir and lookup functions */
const struct file_operations udf_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = udf_readdir,
+ .iterate = udf_readdir,
.unlocked_ioctl = udf_ioctl,
.fsync = generic_file_fsync,
};
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 3a75ca09c506..0ecc2cebed8f 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -430,16 +430,16 @@ ufs_validate_entry(struct super_block *sb, char *base,
* This is blatantly stolen from ext2fs
*/
static int
-ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+ufs_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = ufs_dir_pages(inode);
unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
- int need_revalidate = filp->f_version != inode->i_version;
+ int need_revalidate = file->f_version != inode->i_version;
unsigned flags = UFS_SB(sb)->s_flags;
UFSD("BEGIN\n");
@@ -457,16 +457,16 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
ufs_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_CACHE_SIZE - offset;
return -EIO;
}
kaddr = page_address(page);
if (unlikely(need_revalidate)) {
if (offset) {
offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
- filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
}
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
need_revalidate = 0;
}
de = (struct ufs_dir_entry *)(kaddr+offset);
@@ -479,11 +479,8 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
return -EIO;
}
if (de->d_ino) {
- int over;
unsigned char d_type = DT_UNKNOWN;
- offset = (char *)de - kaddr;
-
UFSD("filldir(%s,%u)\n", de->d_name,
fs32_to_cpu(sb, de->d_ino));
UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
@@ -491,16 +488,15 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
d_type = de->d_u.d_44.d_type;
- over = filldir(dirent, de->d_name,
+ if (!dir_emit(ctx, de->d_name,
ufs_get_de_namlen(sb, de),
- (n<<PAGE_CACHE_SHIFT) | offset,
- fs32_to_cpu(sb, de->d_ino), d_type);
- if (over) {
+ fs32_to_cpu(sb, de->d_ino),
+ d_type)) {
ufs_put_page(page);
return 0;
}
}
- filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
+ ctx->pos += fs16_to_cpu(sb, de->d_reclen);
}
ufs_put_page(page);
}
@@ -660,7 +656,7 @@ not_empty:
const struct file_operations ufs_dir_operations = {
.read = generic_read_dir,
- .readdir = ufs_readdir,
+ .iterate = ufs_readdir,
.fsync = generic_file_fsync,
.llseek = generic_file_llseek,
};
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 1d32f1d52763..306d883d89bc 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -21,6 +21,8 @@
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_vnodeops.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
#include "xfs_trace.h"
#include <linux/slab.h>
#include <linux/xattr.h>
@@ -34,7 +36,9 @@
*/
STATIC struct posix_acl *
-xfs_acl_from_disk(struct xfs_acl *aclp)
+xfs_acl_from_disk(
+ struct xfs_acl *aclp,
+ int max_entries)
{
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
@@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
unsigned int count, i;
count = be32_to_cpu(aclp->acl_cnt);
- if (count > XFS_ACL_MAX_ENTRIES)
+ if (count > max_entries)
return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type)
struct xfs_inode *ip = XFS_I(inode);
struct posix_acl *acl;
struct xfs_acl *xfs_acl;
- int len = sizeof(struct xfs_acl);
unsigned char *ea_name;
int error;
+ int len;
acl = get_cached_acl(inode, type);
if (acl != ACL_NOT_CACHED)
@@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type)
* If we have a cached ACLs value just return it, not need to
* go out to the disk.
*/
-
- xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+ len = XFS_ACL_MAX_SIZE(ip->i_mount);
+ xfs_acl = kzalloc(len, GFP_KERNEL);
if (!xfs_acl)
return ERR_PTR(-ENOMEM);
@@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type)
goto out;
}
- acl = xfs_acl_from_disk(xfs_acl);
+ acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
if (IS_ERR(acl))
goto out;
@@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
if (acl) {
struct xfs_acl *xfs_acl;
- int len;
+ int len = XFS_ACL_MAX_SIZE(ip->i_mount);
- xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+ xfs_acl = kzalloc(len, GFP_KERNEL);
if (!xfs_acl)
return -ENOMEM;
xfs_acl_to_disk(xfs_acl, acl);
- len = sizeof(struct xfs_acl) -
- (sizeof(struct xfs_acl_entry) *
- (XFS_ACL_MAX_ENTRIES - acl->a_count));
+
+ /* subtract away the unused acl entries */
+ len -= sizeof(struct xfs_acl_entry) *
+ (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
len, ATTR_ROOT);
@@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
static int
xfs_acl_exists(struct inode *inode, unsigned char *name)
{
- int len = sizeof(struct xfs_acl);
+ int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
ATTR_ROOT|ATTR_KERNOVAL) == 0);
@@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
goto out_release;
error = -EINVAL;
- if (acl->a_count > XFS_ACL_MAX_ENTRIES)
+ if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
goto out_release;
if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 39632d941354..4016a567b83c 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,19 +22,36 @@ struct inode;
struct posix_acl;
struct xfs_inode;
-#define XFS_ACL_MAX_ENTRIES 25
#define XFS_ACL_NOT_PRESENT (-1)
/* On-disk XFS access control list structure */
+struct xfs_acl_entry {
+ __be32 ae_tag;
+ __be32 ae_id;
+ __be16 ae_perm;
+ __be16 ae_pad; /* fill the implicit hole in the structure */
+};
+
struct xfs_acl {
- __be32 acl_cnt;
- struct xfs_acl_entry {
- __be32 ae_tag;
- __be32 ae_id;
- __be16 ae_perm;
- } acl_entry[XFS_ACL_MAX_ENTRIES];
+ __be32 acl_cnt;
+ struct xfs_acl_entry acl_entry[0];
};
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp) \
+ (xfs_sb_version_hascrc(&mp->m_sb) \
+ ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ sizeof(struct xfs_acl_entry) \
+ : 25)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ (sizeof(struct xfs_acl) + \
+ sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+
/* On-disk XFS extended attribute names */
#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2b2691b73428..596ec71da00e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -725,6 +725,25 @@ xfs_convert_page(
(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
i_size_read(inode));
+ /*
+ * If the current map does not span the entire page we are about to try
+ * to write, then give up. The only way we can write a page that spans
+ * multiple mappings in a single writeback iteration is via the
+ * xfs_vm_writepage() function. Data integrity writeback requires the
+ * entire page to be written in a single attempt, otherwise the part of
+ * the page we don't write here doesn't get written as part of the data
+ * integrity sync.
+ *
+ * For normal writeback, we also don't attempt to write partial pages
+ * here as it simply means that write_cache_pages() will see it under
+ * writeback and ignore the page until some point in the future, at
+ * which time this will be the only page in the file that needs
+ * writeback. Hence for more optimal IO patterns, we should always
+ * avoid partial page writeback due to multiple mappings on a page here.
+ */
+ if (!xfs_imap_valid(inode, imap, end_offset))
+ goto fail_unlock_page;
+
len = 1 << inode->i_blkbits;
p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
PAGE_CACHE_SIZE);
@@ -824,10 +843,12 @@ xfs_cluster_write(
STATIC void
xfs_vm_invalidatepage(
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- trace_xfs_invalidatepage(page->mapping->host, page, offset);
- block_invalidatepage(page, offset);
+ trace_xfs_invalidatepage(page->mapping->host, page, offset,
+ length);
+ block_invalidatepage(page, offset, length);
}
/*
@@ -891,7 +912,7 @@ next_buffer:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
- xfs_vm_invalidatepage(page, 0);
+ xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
return;
}
@@ -921,7 +942,7 @@ xfs_vm_writepage(
int count = 0;
int nonblocking = 0;
- trace_xfs_writepage(inode, page, 0);
+ trace_xfs_writepage(inode, page, 0, 0);
ASSERT(page_has_buffers(page));
@@ -1152,7 +1173,7 @@ xfs_vm_releasepage(
{
int delalloc, unwritten;
- trace_xfs_releasepage(page->mapping->host, page, 0);
+ trace_xfs_releasepage(page->mapping->host, page, 0, 0);
xfs_count_page_state(page, &delalloc, &unwritten);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 08d5457c948e..31d3cd129269 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -931,20 +931,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
*/
int
xfs_attr_shortform_allfit(
- struct xfs_buf *bp,
- struct xfs_inode *dp)
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
xfs_attr_leaf_name_local_t *name_loc;
- int bytes, i;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ int bytes;
+ int i;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
- entry = &leaf->entries[0];
bytes = sizeof(struct xfs_attr_sf_hdr);
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ for (i = 0; i < leafhdr.count; entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
continue; /* don't copy partial entries */
if (!(entry->flags & XFS_ATTR_LOCAL))
@@ -954,15 +956,15 @@ xfs_attr_shortform_allfit(
return(0);
if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
return(0);
- bytes += sizeof(struct xfs_attr_sf_entry)-1
+ bytes += sizeof(struct xfs_attr_sf_entry) - 1
+ name_loc->namelen
+ be16_to_cpu(name_loc->valuelen);
}
if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
(dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
- return(-1);
- return(xfs_attr_shortform_bytesfit(dp, bytes));
+ return -1;
+ return xfs_attr_shortform_bytesfit(dp, bytes);
}
/*
@@ -1410,7 +1412,7 @@ xfs_attr3_leaf_add_work(
name_rmt->valuelen = 0;
name_rmt->valueblk = 0;
args->rmtblkno = 1;
- args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
}
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
@@ -1443,11 +1445,12 @@ xfs_attr3_leaf_add_work(
STATIC void
xfs_attr3_leaf_compact(
struct xfs_da_args *args,
- struct xfs_attr3_icleaf_hdr *ichdr_d,
+ struct xfs_attr3_icleaf_hdr *ichdr_dst,
struct xfs_buf *bp)
{
- xfs_attr_leafblock_t *leaf_s, *leaf_d;
- struct xfs_attr3_icleaf_hdr ichdr_s;
+ struct xfs_attr_leafblock *leaf_src;
+ struct xfs_attr_leafblock *leaf_dst;
+ struct xfs_attr3_icleaf_hdr ichdr_src;
struct xfs_trans *trans = args->trans;
struct xfs_mount *mp = trans->t_mountp;
char *tmpbuffer;
@@ -1455,29 +1458,38 @@ xfs_attr3_leaf_compact(
trace_xfs_attr_leaf_compact(args);
tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
memset(bp->b_addr, 0, XFS_LBSIZE(mp));
+ leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+ leaf_dst = bp->b_addr;
/*
- * Copy basic information
+ * Copy the on-disk header back into the destination buffer to ensure
+ * all the information in the header that is not part of the incore
+ * header structure is preserved.
*/
- leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
- leaf_d = bp->b_addr;
- ichdr_s = *ichdr_d; /* struct copy */
- ichdr_d->firstused = XFS_LBSIZE(mp);
- ichdr_d->usedbytes = 0;
- ichdr_d->count = 0;
- ichdr_d->holes = 0;
- ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s);
- ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+ memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+ /* Initialise the incore headers */
+ ichdr_src = *ichdr_dst; /* struct copy */
+ ichdr_dst->firstused = XFS_LBSIZE(mp);
+ ichdr_dst->usedbytes = 0;
+ ichdr_dst->count = 0;
+ ichdr_dst->holes = 0;
+ ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+ ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+ ichdr_dst->freemap[0].base;
+
+
+ /* write the header back to initialise the underlying buffer */
+ xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
/*
* Copy all entry's in the same (sorted) order,
* but allocate name/value pairs packed and in sequence.
*/
- xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0,
- ichdr_s.count, mp);
+ xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0,
+ ichdr_src.count, mp);
/*
* this logs the entire buffer, but the caller must write the header
* back to the buffer when it is finished modifying it.
@@ -2179,14 +2191,24 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP);
- memset(tmp_leaf, 0, state->blocksize);
- memset(&tmphdr, 0, sizeof(tmphdr));
+ tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP);
+
+ /*
+ * Copy the header into the temp leaf so that all the stuff
+ * not in the incore header is present and gets copied back in
+ * once we've moved all the entries.
+ */
+ memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+ memset(&tmphdr, 0, sizeof(tmphdr));
tmphdr.magic = savehdr.magic;
tmphdr.forw = savehdr.forw;
tmphdr.back = savehdr.back;
tmphdr.firstused = state->blocksize;
+
+ /* write the header to the temp buffer to initialise it */
+ xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+
if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
drop_blk->bp, &drophdr)) {
xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
@@ -2330,9 +2352,11 @@ xfs_attr3_leaf_lookup_int(
if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
+ args->valuelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
- be32_to_cpu(name_rmt->valuelen));
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(
+ args->dp->i_mount,
+ args->valuelen);
return XFS_ERROR(EEXIST);
}
}
@@ -2383,7 +2407,8 @@ xfs_attr3_leaf_getvalue(
ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
valuelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+ valuelen);
if (args->flags & ATTR_KERNOVAL) {
args->valuelen = valuelen;
return 0;
@@ -2709,7 +2734,8 @@ xfs_attr3_leaf_list_int(
args.valuelen = valuelen;
args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
+ args.rmtblkcnt = xfs_attr3_rmt_blocks(
+ args.dp->i_mount, valuelen);
retval = xfs_attr_rmtval_get(&args);
if (retval)
return retval;
@@ -3232,7 +3258,7 @@ xfs_attr3_leaf_inactive(
name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
if (name_rmt->valueblk) {
lp->valueblk = be32_to_cpu(name_rmt->valueblk);
- lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
+ lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
be32_to_cpu(name_rmt->valuelen));
lp++;
}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index f9d7846097e2..444a7704596c 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -128,6 +128,7 @@ struct xfs_attr3_leaf_hdr {
__u8 holes;
__u8 pad1;
struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+ __be32 pad2; /* 64 bit alignment */
};
#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index dee84466dcc9..ef6b0c124528 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -47,22 +47,55 @@
* Each contiguous block has a header, so it is not just a simple attribute
* length to FSB conversion.
*/
-static int
+int
xfs_attr3_rmt_blocks(
struct xfs_mount *mp,
int attrlen)
{
- int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp,
- mp->m_sb.sb_blocksize);
- return (attrlen + buflen - 1) / buflen;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+ return (attrlen + buflen - 1) / buflen;
+ }
+ return XFS_B_TO_FSB(mp, attrlen);
+}
+
+/*
+ * Checking of the remote attribute header is split into two parts. The verifier
+ * does CRC, location and bounds checking, the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+ struct xfs_mount *mp,
+ void *ptr,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (bno != be64_to_cpu(rmt->rm_blkno))
+ return false;
+ if (offset != be32_to_cpu(rmt->rm_offset))
+ return false;
+ if (size != be32_to_cpu(rmt->rm_bytes))
+ return false;
+ if (ino != be64_to_cpu(rmt->rm_owner))
+ return false;
+
+ /* ok */
+ return true;
}
static bool
xfs_attr3_rmt_verify(
- struct xfs_buf *bp)
+ struct xfs_mount *mp,
+ void *ptr,
+ int fsbsize,
+ xfs_daddr_t bno)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
@@ -70,7 +103,9 @@ xfs_attr3_rmt_verify(
return false;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
return false;
- if (bp->b_bn != be64_to_cpu(rmt->rm_blkno))
+ if (be64_to_cpu(rmt->rm_blkno) != bno)
+ return false;
+ if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
return false;
if (be32_to_cpu(rmt->rm_offset) +
be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
@@ -86,17 +121,40 @@ xfs_attr3_rmt_read_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
+ char *ptr;
+ int len;
+ bool corrupt = false;
+ xfs_daddr_t bno;
/* no verification of non-crc buffers */
if (!xfs_sb_version_hascrc(&mp->m_sb))
return;
- if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_ATTR3_RMT_CRC_OFF) ||
- !xfs_attr3_rmt_verify(bp)) {
+ ptr = bp->b_addr;
+ bno = bp->b_bn;
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= XFS_LBSIZE(mp));
+
+ while (len > 0) {
+ if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
+ XFS_ATTR3_RMT_CRC_OFF)) {
+ corrupt = true;
+ break;
+ }
+ if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
+ corrupt = true;
+ break;
+ }
+ len -= XFS_LBSIZE(mp);
+ ptr += XFS_LBSIZE(mp);
+ bno += mp->m_bsize;
+ }
+
+ if (corrupt) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+ } else
+ ASSERT(len == 0);
}
static void
@@ -105,23 +163,39 @@ xfs_attr3_rmt_write_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_buf_log_item *bip = bp->b_fspriv;
+ char *ptr;
+ int len;
+ xfs_daddr_t bno;
/* no verification of non-crc buffers */
if (!xfs_sb_version_hascrc(&mp->m_sb))
return;
- if (!xfs_attr3_rmt_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- return;
- }
+ ptr = bp->b_addr;
+ bno = bp->b_bn;
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= XFS_LBSIZE(mp));
+
+ while (len > 0) {
+ if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
+ XFS_CORRUPTION_ERROR(__func__,
+ XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+ if (bip) {
+ struct xfs_attr3_rmt_hdr *rmt;
+
+ rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+ rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ }
+ xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF);
- if (bip) {
- struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
- rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ len -= XFS_LBSIZE(mp);
+ ptr += XFS_LBSIZE(mp);
+ bno += mp->m_bsize;
}
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_ATTR3_RMT_CRC_OFF);
+ ASSERT(len == 0);
}
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
@@ -129,15 +203,16 @@ const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
.verify_write = xfs_attr3_rmt_write_verify,
};
-static int
+STATIC int
xfs_attr3_rmt_hdr_set(
struct xfs_mount *mp,
+ void *ptr,
xfs_ino_t ino,
uint32_t offset,
uint32_t size,
- struct xfs_buf *bp)
+ xfs_daddr_t bno)
{
- struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
if (!xfs_sb_version_hascrc(&mp->m_sb))
return 0;
@@ -147,36 +222,107 @@ xfs_attr3_rmt_hdr_set(
rmt->rm_bytes = cpu_to_be32(size);
uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
rmt->rm_owner = cpu_to_be64(ino);
- rmt->rm_blkno = cpu_to_be64(bp->b_bn);
- bp->b_ops = &xfs_attr3_rmt_buf_ops;
+ rmt->rm_blkno = cpu_to_be64(bno);
return sizeof(struct xfs_attr3_rmt_hdr);
}
/*
- * Checking of the remote attribute header is split into two parts. the verifier
- * does CRC, location and bounds checking, the unpacking function checks the
- * attribute parameters and owner.
+ * Helper functions to copy attribute data in and out of the one disk extents
*/
-static bool
-xfs_attr3_rmt_hdr_ok(
- struct xfs_mount *mp,
- xfs_ino_t ino,
- uint32_t offset,
- uint32_t size,
- struct xfs_buf *bp)
+STATIC int
+xfs_attr_rmtval_copyout(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ char **dst)
{
- struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+ char *src = bp->b_addr;
+ xfs_daddr_t bno = bp->b_bn;
+ int len = BBTOB(bp->b_length);
- if (offset != be32_to_cpu(rmt->rm_offset))
- return false;
- if (size != be32_to_cpu(rmt->rm_bytes))
- return false;
- if (ino != be64_to_cpu(rmt->rm_owner))
- return false;
+ ASSERT(len >= XFS_LBSIZE(mp));
- /* ok */
- return true;
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size = 0;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+
+ byte_cnt = min_t(int, *valuelen, byte_cnt);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
+ byte_cnt, bno)) {
+ xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
+ bno, *offset, byte_cnt, ino);
+ return EFSCORRUPTED;
+ }
+ hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+ }
+
+ memcpy(*dst, src + hdr_size, byte_cnt);
+
+ /* roll buffer forwards */
+ len -= XFS_LBSIZE(mp);
+ src += XFS_LBSIZE(mp);
+ bno += mp->m_bsize;
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *dst += byte_cnt;
+ *offset += byte_cnt;
+ }
+ return 0;
+}
+
+STATIC void
+xfs_attr_rmtval_copyin(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ char **src)
+{
+ char *dst = bp->b_addr;
+ xfs_daddr_t bno = bp->b_bn;
+ int len = BBTOB(bp->b_length);
+
+ ASSERT(len >= XFS_LBSIZE(mp));
+
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+
+ byte_cnt = min(*valuelen, byte_cnt);
+ hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+ byte_cnt, bno);
+
+ memcpy(dst + hdr_size, *src, byte_cnt);
+
+ /*
+ * If this is the last block, zero the remainder of it.
+ * Check that we are actually the last block, too.
+ */
+ if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) {
+ ASSERT(*valuelen - byte_cnt == 0);
+ ASSERT(len == XFS_LBSIZE(mp));
+ memset(dst + hdr_size + byte_cnt, 0,
+ XFS_LBSIZE(mp) - hdr_size - byte_cnt);
+ }
+
+ /* roll buffer forwards */
+ len -= XFS_LBSIZE(mp);
+ dst += XFS_LBSIZE(mp);
+ bno += mp->m_bsize;
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *src += byte_cnt;
+ *offset += byte_cnt;
+ }
}
/*
@@ -190,13 +336,12 @@ xfs_attr_rmtval_get(
struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE];
struct xfs_mount *mp = args->dp->i_mount;
struct xfs_buf *bp;
- xfs_daddr_t dblkno;
xfs_dablk_t lblkno = args->rmtblkno;
- void *dst = args->value;
+ char *dst = args->value;
int valuelen = args->valuelen;
int nmap;
int error;
- int blkcnt;
+ int blkcnt = args->rmtblkcnt;
int i;
int offset = 0;
@@ -207,52 +352,36 @@ xfs_attr_rmtval_get(
while (valuelen > 0) {
nmap = ATTR_RMTVALUE_MAPSIZE;
error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt, map, &nmap,
+ blkcnt, map, &nmap,
XFS_BMAPI_ATTRFORK);
if (error)
return error;
ASSERT(nmap >= 1);
for (i = 0; (i < nmap) && (valuelen > 0); i++) {
- int byte_cnt;
- char *src;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
(map[i].br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
- blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
- dblkno, blkcnt, 0, &bp,
+ dblkno, dblkcnt, 0, &bp,
&xfs_attr3_rmt_buf_ops);
if (error)
return error;
- byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length));
- byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt);
-
- src = bp->b_addr;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino,
- offset, byte_cnt, bp)) {
- xfs_alert(mp,
-"remote attribute header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
- offset, byte_cnt, args->dp->i_ino);
- xfs_buf_relse(bp);
- return EFSCORRUPTED;
-
- }
-
- src += sizeof(struct xfs_attr3_rmt_hdr);
- }
-
- memcpy(dst, src, byte_cnt);
+ error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+ &offset, &valuelen,
+ &dst);
xfs_buf_relse(bp);
+ if (error)
+ return error;
- offset += byte_cnt;
- dst += byte_cnt;
- valuelen -= byte_cnt;
-
+ /* roll attribute extent map forwards */
lblkno += map[i].br_blockcount;
+ blkcnt -= map[i].br_blockcount;
}
}
ASSERT(valuelen == 0);
@@ -270,17 +399,13 @@ xfs_attr_rmtval_set(
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
struct xfs_bmbt_irec map;
- struct xfs_buf *bp;
- xfs_daddr_t dblkno;
xfs_dablk_t lblkno;
xfs_fileoff_t lfileoff = 0;
- void *src = args->value;
+ char *src = args->value;
int blkcnt;
int valuelen;
int nmap;
int error;
- int hdrcnt = 0;
- bool crcs = xfs_sb_version_hascrc(&mp->m_sb);
int offset = 0;
trace_xfs_attr_rmtval_set(args);
@@ -289,24 +414,14 @@ xfs_attr_rmtval_set(
* Find a "hole" in the attribute address space large enough for
* us to drop the new attribute's value into. Because CRC enable
* attributes have headers, we can't just do a straight byte to FSB
- * conversion. We calculate the worst case block count in this case
- * and we may not need that many, so we have to handle this when
- * allocating the blocks below.
+ * conversion and have to take the header space into account.
*/
- if (!crcs)
- blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
- else
- blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
-
+ blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
XFS_ATTR_FORK);
if (error)
return error;
- /* Start with the attribute data. We'll allocate the rest afterwards. */
- if (crcs)
- blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
-
args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
args->rmtblkcnt = blkcnt;
@@ -349,26 +464,6 @@ xfs_attr_rmtval_set(
(map.br_startblock != HOLESTARTBLOCK));
lblkno += map.br_blockcount;
blkcnt -= map.br_blockcount;
- hdrcnt++;
-
- /*
- * If we have enough blocks for the attribute data, calculate
- * how many extra blocks we need for headers. We might run
- * through this multiple times in the case that the additional
- * headers in the blocks needed for the data fragments spills
- * into requiring more blocks. e.g. for 512 byte blocks, we'll
- * spill for another block every 9 headers we require in this
- * loop.
- */
- if (crcs && blkcnt == 0) {
- int total_len;
-
- total_len = args->valuelen +
- hdrcnt * sizeof(struct xfs_attr3_rmt_hdr);
- blkcnt = XFS_B_TO_FSB(mp, total_len);
- blkcnt -= args->rmtblkcnt;
- args->rmtblkcnt += blkcnt;
- }
/*
* Start the next trans in the chain.
@@ -385,18 +480,19 @@ xfs_attr_rmtval_set(
* the INCOMPLETE flag.
*/
lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
valuelen = args->valuelen;
while (valuelen > 0) {
- int byte_cnt;
- char *buf;
+ struct xfs_buf *bp;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT(blkcnt > 0);
- /*
- * Try to remember where we decided to put the value.
- */
xfs_bmap_init(args->flist, args->firstblock);
nmap = 1;
error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt, &map, &nmap,
+ blkcnt, &map, &nmap,
XFS_BMAPI_ATTRFORK);
if (error)
return(error);
@@ -405,41 +501,27 @@ xfs_attr_rmtval_set(
(map.br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+ dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
- bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0);
+ bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
if (!bp)
return ENOMEM;
bp->b_ops = &xfs_attr3_rmt_buf_ops;
- byte_cnt = BBTOB(bp->b_length);
- byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt);
- if (valuelen < byte_cnt)
- byte_cnt = valuelen;
-
- buf = bp->b_addr;
- buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset,
- byte_cnt, bp);
- memcpy(buf, src, byte_cnt);
-
- if (byte_cnt < BBTOB(bp->b_length))
- xfs_buf_zero(bp, byte_cnt,
- BBTOB(bp->b_length) - byte_cnt);
+ xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+ &valuelen, &src);
error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
xfs_buf_relse(bp);
if (error)
return error;
- src += byte_cnt;
- valuelen -= byte_cnt;
- offset += byte_cnt;
- hdrcnt--;
+ /* roll attribute extent map forwards */
lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
}
ASSERT(valuelen == 0);
- ASSERT(hdrcnt == 0);
return 0;
}
@@ -448,33 +530,40 @@ xfs_attr_rmtval_set(
* out-of-line buffer that it is stored on.
*/
int
-xfs_attr_rmtval_remove(xfs_da_args_t *args)
+xfs_attr_rmtval_remove(
+ struct xfs_da_args *args)
{
- xfs_mount_t *mp;
- xfs_bmbt_irec_t map;
- xfs_buf_t *bp;
- xfs_daddr_t dblkno;
- xfs_dablk_t lblkno;
- int valuelen, blkcnt, nmap, error, done, committed;
+ struct xfs_mount *mp = args->dp->i_mount;
+ xfs_dablk_t lblkno;
+ int blkcnt;
+ int error;
+ int done;
trace_xfs_attr_rmtval_remove(args);
- mp = args->dp->i_mount;
-
/*
- * Roll through the "value", invalidating the attribute value's
- * blocks.
+ * Roll through the "value", invalidating the attribute value's blocks.
+ * Note that args->rmtblkcnt is the minimum number of data blocks we'll
+ * see for a CRC enabled remote attribute. Each extent will have a
+ * header, and so we may have more blocks than we realise here. If we
+ * fail to map the blocks correctly, we'll have problems with the buffer
+ * lookups.
*/
lblkno = args->rmtblkno;
- valuelen = args->rmtblkcnt;
- while (valuelen > 0) {
+ blkcnt = args->rmtblkcnt;
+ while (blkcnt > 0) {
+ struct xfs_bmbt_irec map;
+ struct xfs_buf *bp;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+ int nmap;
+
/*
* Try to remember where we decided to put the value.
*/
nmap = 1;
error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt, &map, &nmap,
- XFS_BMAPI_ATTRFORK);
+ blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
if (error)
return(error);
ASSERT(nmap == 1);
@@ -482,21 +571,20 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
(map.br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+ dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
/*
* If the "remote" value is in the cache, remove it.
*/
- bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
+ bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
if (bp) {
xfs_buf_stale(bp);
xfs_buf_relse(bp);
bp = NULL;
}
- valuelen -= map.br_blockcount;
-
lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
}
/*
@@ -506,6 +594,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
blkcnt = args->rmtblkcnt;
done = 0;
while (!done) {
+ int committed;
+
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h
index c7cca60a062a..92a8fd7977cc 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/xfs_attr_remote.h
@@ -20,6 +20,14 @@
#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
+/*
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
struct xfs_attr3_rmt_hdr {
__be32 rm_magic;
__be32 rm_offset;
@@ -39,6 +47,8 @@ struct xfs_attr3_rmt_hdr {
extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+
int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_set(struct xfs_da_args *args);
int xfs_attr_rmtval_remove(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 8804b8a3c310..0903960410a2 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -2544,7 +2544,17 @@ xfs_btree_new_iroot(
if (error)
goto error0;
+ /*
+ * we can't just memcpy() the root in for CRC enabled btree blocks.
+ * In that case have to also ensure the blkno remains correct
+ */
memcpy(cblock, block, xfs_btree_block_len(cur));
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+ else
+ cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+ }
be16_add_cpu(&block->bb_level, 1);
xfs_btree_set_numrecs(block, 1);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 82b70bda9f47..1b2472a46e46 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -513,6 +513,7 @@ _xfs_buf_find(
xfs_alert(btp->bt_mount,
"%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
__func__, blkno, eofs);
+ WARN_ON(1);
return NULL;
}
@@ -1649,7 +1650,7 @@ xfs_alloc_buftarg(
{
xfs_buftarg_t *btp;
- btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
+ btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index cf263476d6b4..4ec431777048 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -262,12 +262,7 @@ xfs_buf_item_format_segment(
vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
vecp->i_len = nbits * XFS_BLF_CHUNK;
vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-/*
- * You would think we need to bump the nvecs here too, but we do not
- * this number is used by recovery, and it gets confused by the boundary
- * split here
- * nvecs++;
- */
+ nvecs++;
vecp++;
first_bit = next_bit;
last_bit = next_bit;
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9b26a99ebfe9..0b8b2a13cd24 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -270,6 +270,7 @@ xfs_da3_node_read_verify(
break;
return;
case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
bp->b_ops = &xfs_attr3_leaf_buf_ops;
bp->b_ops->verify_read(bp);
return;
@@ -2464,7 +2465,8 @@ xfs_buf_map_from_irec(
ASSERT(nirecs >= 1);
if (nirecs > 1) {
- map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP);
+ map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+ KM_SLEEP | KM_NOFS);
if (!map)
return ENOMEM;
*mapp = map;
@@ -2520,7 +2522,8 @@ xfs_dabuf_map(
* Optimize the one-block case.
*/
if (nfsb != 1)
- irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP);
+ irecs = kmem_zalloc(sizeof(irec) * nfsb,
+ KM_SLEEP | KM_NOFS);
nirecs = nfsb;
error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f852b082a084..c407e1ccff43 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -219,6 +219,14 @@ xfs_swap_extents(
int taforkblks = 0;
__uint64_t tmp;
+ /*
+ * We have no way of updating owner information in the BMBT blocks for
+ * each inode on CRC enabled filesystems, so to avoid corrupting the
+ * this metadata we simply don't allow extent swaps to occur.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return XFS_ERROR(EINVAL);
+
tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
if (!tempifp) {
error = XFS_ERROR(ENOMEM);
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index b26a50f9921d..8f023dee404d 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -368,10 +368,8 @@ xfs_dir_removename(
int
xfs_readdir(
xfs_inode_t *dp,
- void *dirent,
- size_t bufsize,
- xfs_off_t *offset,
- filldir_t filldir)
+ struct dir_context *ctx,
+ size_t bufsize)
{
int rval; /* return value */
int v; /* type-checking value */
@@ -385,14 +383,13 @@ xfs_readdir(
XFS_STATS_INC(xs_dir_getdents);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir);
+ rval = xfs_dir2_sf_getdents(dp, ctx);
else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
;
else if (v)
- rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir);
+ rval = xfs_dir2_block_getdents(dp, ctx);
else
- rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset,
- filldir);
+ rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
return rval;
}
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e59f5fc816fe..09aea0247d96 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -569,9 +569,7 @@ xfs_dir2_block_addname(
int /* error */
xfs_dir2_block_getdents(
xfs_inode_t *dp, /* incore inode */
- void *dirent,
- xfs_off_t *offset,
- filldir_t filldir)
+ struct dir_context *ctx)
{
xfs_dir2_data_hdr_t *hdr; /* block header */
struct xfs_buf *bp; /* buffer for block */
@@ -589,7 +587,7 @@ xfs_dir2_block_getdents(
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
+ if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
return 0;
error = xfs_dir3_block_read(NULL, dp, &bp);
@@ -600,7 +598,7 @@ xfs_dir2_block_getdents(
* Extract the byte offset we start at from the seek pointer.
* We'll skip entries before this.
*/
- wantoff = xfs_dir2_dataptr_to_off(mp, *offset);
+ wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
/*
@@ -639,13 +637,12 @@ xfs_dir2_block_getdents(
cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
(char *)dep - (char *)hdr);
+ ctx->pos = cook & 0x7fffffff;
/*
* If it didn't fit, set the final offset to here & return.
*/
- if (filldir(dirent, (char *)dep->name, dep->namelen,
- cook & 0x7fffffff, be64_to_cpu(dep->inumber),
- DT_UNKNOWN)) {
- *offset = cook & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+ be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
xfs_trans_brelse(NULL, bp);
return 0;
}
@@ -655,7 +652,7 @@ xfs_dir2_block_getdents(
* Reached the end of the block.
* Set the offset to a non-existent block 1 and return.
*/
- *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+ ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
0x7fffffff;
xfs_trans_brelse(NULL, bp);
return 0;
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index a3b1bd841a80..7826782b8d78 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -266,6 +266,7 @@ struct xfs_dir3_blk_hdr {
struct xfs_dir3_data_hdr {
struct xfs_dir3_blk_hdr hdr;
xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT];
+ __be32 pad; /* 64 bit alignment */
};
#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc)
@@ -477,7 +478,7 @@ struct xfs_dir3_leaf_hdr {
struct xfs_da3_blkinfo info; /* header for da routines */
__be16 count; /* count of entries */
__be16 stale; /* count of stale entries */
- __be32 pad;
+ __be32 pad; /* 64 bit alignment */
};
struct xfs_dir3_icleaf_hdr {
@@ -715,6 +716,7 @@ struct xfs_dir3_free_hdr {
__be32 firstdb; /* db of first entry */
__be32 nvalid; /* count of valid entries */
__be32 nused; /* count of used entries */
+ __be32 pad; /* 64 bit alignment */
};
struct xfs_dir3_free {
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 721ba2fe8e54..e0cc1243a8aa 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1300,10 +1300,8 @@ out:
int /* error */
xfs_dir2_leaf_getdents(
xfs_inode_t *dp, /* incore directory inode */
- void *dirent,
- size_t bufsize,
- xfs_off_t *offset,
- filldir_t filldir)
+ struct dir_context *ctx,
+ size_t bufsize)
{
struct xfs_buf *bp = NULL; /* data block buffer */
xfs_dir2_data_hdr_t *hdr; /* data block header */
@@ -1322,7 +1320,7 @@ xfs_dir2_leaf_getdents(
* If the offset is at or past the largest allowed value,
* give up right away.
*/
- if (*offset >= XFS_DIR2_MAX_DATAPTR)
+ if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
return 0;
mp = dp->i_mount;
@@ -1336,14 +1334,14 @@ xfs_dir2_leaf_getdents(
mp->m_sb.sb_blocksize);
map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
(length * sizeof(struct xfs_bmbt_irec)),
- KM_SLEEP);
+ KM_SLEEP | KM_NOFS);
map_info->map_size = length;
/*
* Inside the loop we keep the main offset value as a byte offset
* in the directory file.
*/
- curoff = xfs_dir2_dataptr_to_byte(mp, *offset);
+ curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
/*
* Force this conversion through db so we truncate the offset
@@ -1444,8 +1442,8 @@ xfs_dir2_leaf_getdents(
dep = (xfs_dir2_data_entry_t *)ptr;
length = xfs_dir2_data_entsize(dep->namelen);
- if (filldir(dirent, (char *)dep->name, dep->namelen,
- xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
+ ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber), DT_UNKNOWN))
break;
@@ -1462,9 +1460,9 @@ xfs_dir2_leaf_getdents(
* All done. Set output offset value to current offset.
*/
if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
- *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+ ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
else
- *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+ ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
kmem_free(map_info);
if (bp)
xfs_trans_brelse(NULL, bp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 5246de4912d4..2226a00acd15 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -263,18 +263,19 @@ xfs_dir3_free_get_buf(
* Initialize the new block to be empty, and remember
* its first slot as our empty slot.
*/
- hdr.magic = XFS_DIR2_FREE_MAGIC;
- hdr.firstdb = 0;
- hdr.nused = 0;
- hdr.nvalid = 0;
+ memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+ memset(&hdr, 0, sizeof(hdr));
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
hdr.magic = XFS_DIR3_FREE_MAGIC;
+
hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
- }
+ } else
+ hdr.magic = XFS_DIR2_FREE_MAGIC;
xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr);
*bpp = bp;
return 0;
@@ -1921,8 +1922,6 @@ xfs_dir2_node_addname_int(
*/
freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
xfs_dir3_free_max_bests(mp);
- free->hdr.nvalid = 0;
- free->hdr.nused = 0;
} else {
free = fbp->b_addr;
bests = xfs_dir3_free_bests_p(mp, free);
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 7cf573c88aad..0511cda4a712 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -33,8 +33,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
- xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
+ struct dir_context *ctx);
extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
extern int xfs_dir2_block_removename(struct xfs_da_args *args);
extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -91,8 +91,8 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
struct xfs_dir2_leaf_entry *ents, int *indexp,
int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent,
- size_t bufsize, xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
+ size_t bufsize);
extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
struct xfs_buf **bpp, __uint16_t magic);
extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
@@ -153,8 +153,7 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
int size, xfs_dir2_sf_hdr_t *sfhp);
extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent,
- xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 6157424dbf8f..97676a347da1 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -768,9 +768,7 @@ xfs_dir2_sf_create(
int /* error */
xfs_dir2_sf_getdents(
xfs_inode_t *dp, /* incore directory inode */
- void *dirent,
- xfs_off_t *offset,
- filldir_t filldir)
+ struct dir_context *ctx)
{
int i; /* shortform entry number */
xfs_mount_t *mp; /* filesystem mount point */
@@ -802,7 +800,7 @@ xfs_dir2_sf_getdents(
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
+ if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
return 0;
/*
@@ -819,22 +817,20 @@ xfs_dir2_sf_getdents(
/*
* Put . entry unless we're starting past it.
*/
- if (*offset <= dot_offset) {
- if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
- *offset = dot_offset & 0x7fffffff;
+ if (ctx->pos <= dot_offset) {
+ ctx->pos = dot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
return 0;
- }
}
/*
* Put .. entry unless we're starting past it.
*/
- if (*offset <= dotdot_offset) {
+ if (ctx->pos <= dotdot_offset) {
ino = xfs_dir2_sf_get_parent_ino(sfp);
- if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
- *offset = dotdot_offset & 0x7fffffff;
+ ctx->pos = dotdot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
return 0;
- }
}
/*
@@ -845,21 +841,20 @@ xfs_dir2_sf_getdents(
off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
xfs_dir2_sf_get_offset(sfep));
- if (*offset > off) {
+ if (ctx->pos > off) {
sfep = xfs_dir2_sf_nextentry(sfp, sfep);
continue;
}
ino = xfs_dir2_sfe_get_ino(sfp, sfep);
- if (filldir(dirent, (char *)sfep->name, sfep->namelen,
- off & 0x7fffffff, ino, DT_UNKNOWN)) {
- *offset = off & 0x7fffffff;
+ ctx->pos = off & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
+ ino, DT_UNKNOWN))
return 0;
- }
sfep = xfs_dir2_sf_nextentry(sfp, sfep);
}
- *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+ ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
0x7fffffff;
return 0;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a41f8bf1da37..044e97a33c8d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -249,8 +249,11 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_flags = type;
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
}
xfs_trans_dquot_buf(tp, bp,
@@ -286,23 +289,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
}
-STATIC void
-xfs_dquot_buf_calc_crc(
- struct xfs_mount *mp,
- struct xfs_buf *bp)
-{
- struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
- int i;
-
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return;
-
- for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) {
- xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
- offsetof(struct xfs_dqblk, dd_crc));
- }
-}
-
STATIC bool
xfs_dquot_buf_verify_crc(
struct xfs_mount *mp,
@@ -328,12 +314,11 @@ xfs_dquot_buf_verify_crc(
for (i = 0; i < ndquots; i++, d++) {
if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
- offsetof(struct xfs_dqblk, dd_crc)))
+ XFS_DQUOT_CRC_OFF))
return false;
if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
return false;
}
-
return true;
}
@@ -393,6 +378,11 @@ xfs_dquot_buf_read_verify(
}
}
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
void
xfs_dquot_buf_write_verify(
struct xfs_buf *bp)
@@ -404,7 +394,6 @@ xfs_dquot_buf_write_verify(
xfs_buf_ioerror(bp, EFSCORRUPTED);
return;
}
- xfs_dquot_buf_calc_crc(mp, bp);
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
@@ -1151,11 +1140,17 @@ xfs_qm_dqflush(
* copy the lsn into the on-disk dquot now while we have the in memory
* dquot here. This can't be done later in the write verifier as we
* can't get access to the log item at that point in time.
+ *
+ * We also calculate the CRC here so that the on-disk dquot in the
+ * buffer always has a valid CRC. This ensures there is no possibility
+ * of a dquot without an up-to-date CRC getting to disk.
*/
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
}
/*
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index c0f375087efc..452920a3f03f 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -305,11 +305,12 @@ xfs_efi_release(xfs_efi_log_item_t *efip,
{
ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
- __xfs_efi_release(efip);
-
/* recovery needs us to drop the EFI reference, too */
if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
__xfs_efi_release(efip);
+
+ __xfs_efi_release(efip);
+ /* efip may now have been freed, do not reference it again. */
}
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a5f2042aec8b..0ad2b95fca12 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -906,11 +906,10 @@ xfs_file_release(
STATIC int
xfs_file_readdir(
- struct file *filp,
- void *dirent,
- filldir_t filldir)
+ struct file *file,
+ struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
xfs_inode_t *ip = XFS_I(inode);
int error;
size_t bufsize;
@@ -929,8 +928,7 @@ xfs_file_readdir(
*/
bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
- error = xfs_readdir(ip, dirent, bufsize,
- (xfs_off_t *)&filp->f_pos, filldir);
+ error = xfs_readdir(ip, ctx, bufsize);
if (error)
return -error;
return 0;
@@ -1432,7 +1430,7 @@ const struct file_operations xfs_file_operations = {
const struct file_operations xfs_dir_file_operations = {
.open = xfs_dir_open,
.read = generic_read_dir,
- .readdir = xfs_file_readdir,
+ .iterate = xfs_file_readdir,
.llseek = generic_file_llseek,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 6dda3f949b04..d04695545397 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
/*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 87595b211da1..3c3644ea825b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -99,7 +99,9 @@ xfs_fs_geometry(
(xfs_sb_version_hasattr2(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
+ XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
+ (xfs_sb_version_hascrc(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_V5SB : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index efbe1accb6ca..7f7be5f98f52 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1638,6 +1638,10 @@ xfs_iunlink(
dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1723,6 +1727,10 @@ xfs_iunlink_remove(
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1796,6 +1804,10 @@ xfs_iunlink_remove(
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1809,6 +1821,10 @@ xfs_iunlink_remove(
last_dip->di_next_unlinked = cpu_to_be32(next_agino);
ASSERT(next_agino != 0);
offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, last_dip);
+
xfs_trans_inode_buf(tp, last_ibp);
xfs_trans_log_buf(tp, last_ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d82efaa2ac73..ca9ecaa81112 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -455,6 +455,28 @@ xfs_vn_getattr(
return 0;
}
+static void
+xfs_setattr_mode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct iattr *iattr)
+{
+ struct inode *inode = VFS_I(ip);
+ umode_t mode = iattr->ia_mode;
+
+ ASSERT(tp);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ mode &= ~S_ISGID;
+
+ ip->i_d.di_mode &= S_IFMT;
+ ip->i_d.di_mode |= mode & ~S_IFMT;
+
+ inode->i_mode &= S_IFMT;
+ inode->i_mode |= mode & ~S_IFMT;
+}
+
int
xfs_setattr_nonsize(
struct xfs_inode *ip,
@@ -606,18 +628,8 @@ xfs_setattr_nonsize(
/*
* Change file access modes.
*/
- if (mask & ATTR_MODE) {
- umode_t mode = iattr->ia_mode;
-
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- mode &= ~S_ISGID;
-
- ip->i_d.di_mode &= S_IFMT;
- ip->i_d.di_mode |= mode & ~S_IFMT;
-
- inode->i_mode &= S_IFMT;
- inode->i_mode |= mode & ~S_IFMT;
- }
+ if (mask & ATTR_MODE)
+ xfs_setattr_mode(tp, ip, iattr);
/*
* Change file access or modified times.
@@ -714,9 +726,8 @@ xfs_setattr_size(
return XFS_ERROR(error);
ASSERT(S_ISREG(ip->i_d.di_mode));
- ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
- ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+ ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
+ ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
if (!(flags & XFS_ATTR_NOLOCK)) {
lock_flags |= XFS_IOLOCK_EXCL;
@@ -860,6 +871,12 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
+ /*
+ * Change file access modes.
+ */
+ if (mask & ATTR_MODE)
+ xfs_setattr_mode(tp, ip, iattr);
+
if (mask & ATTR_CTIME) {
inode->i_ctime = iattr->ia_ctime;
ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index e3d0b85d852b..d0833b54e55d 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -139,7 +139,7 @@ xlog_cil_prepare_log_vecs(
new_lv = kmem_zalloc(sizeof(*new_lv) +
niovecs * sizeof(struct xfs_log_iovec),
- KM_SLEEP);
+ KM_SLEEP|KM_NOFS);
/* The allocated iovec region lies beyond the log vector. */
new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 93f03ec17eec..7cf5e4eafe28 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1599,10 +1599,43 @@ xlog_recover_add_to_trans(
}
/*
- * Sort the log items in the transaction. Cancelled buffers need
- * to be put first so they are processed before any items that might
- * modify the buffers. If they are cancelled, then the modifications
- * don't need to be replayed.
+ * Sort the log items in the transaction.
+ *
+ * The ordering constraints are defined by the inode allocation and unlink
+ * behaviour. The rules are:
+ *
+ * 1. Every item is only logged once in a given transaction. Hence it
+ * represents the last logged state of the item. Hence ordering is
+ * dependent on the order in which operations need to be performed so
+ * required initial conditions are always met.
+ *
+ * 2. Cancelled buffers are recorded in pass 1 in a separate table and
+ * there's nothing to replay from them so we can simply cull them
+ * from the transaction. However, we can't do that until after we've
+ * replayed all the other items because they may be dependent on the
+ * cancelled buffer and replaying the cancelled buffer can remove it
+ * form the cancelled buffer table. Hence they have tobe done last.
+ *
+ * 3. Inode allocation buffers must be replayed before inode items that
+ * read the buffer and replay changes into it.
+ *
+ * 4. Inode unlink buffers must be replayed after inode items are replayed.
+ * This ensures that inodes are completely flushed to the inode buffer
+ * in a "free" state before we remove the unlinked inode list pointer.
+ *
+ * Hence the ordering needs to be inode allocation buffers first, inode items
+ * second, inode unlink buffers third and cancelled buffers last.
+ *
+ * But there's a problem with that - we can't tell an inode allocation buffer
+ * apart from a regular buffer, so we can't separate them. We can, however,
+ * tell an inode unlink buffer from the others, and so we can separate them out
+ * from all the other buffers and move them to last.
+ *
+ * Hence, 4 lists, in order from head to tail:
+ * - buffer_list for all buffers except cancelled/inode unlink buffers
+ * - item_list for all non-buffer items
+ * - inode_buffer_list for inode unlink buffers
+ * - cancel_list for the cancelled buffers
*/
STATIC int
xlog_recover_reorder_trans(
@@ -1612,6 +1645,10 @@ xlog_recover_reorder_trans(
{
xlog_recover_item_t *item, *n;
LIST_HEAD(sort_list);
+ LIST_HEAD(cancel_list);
+ LIST_HEAD(buffer_list);
+ LIST_HEAD(inode_buffer_list);
+ LIST_HEAD(inode_list);
list_splice_init(&trans->r_itemq, &sort_list);
list_for_each_entry_safe(item, n, &sort_list, ri_list) {
@@ -1619,12 +1656,18 @@ xlog_recover_reorder_trans(
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
- if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+ if (buf_f->blf_flags & XFS_BLF_CANCEL) {
trace_xfs_log_recover_item_reorder_head(log,
trans, item, pass);
- list_move(&item->ri_list, &trans->r_itemq);
+ list_move(&item->ri_list, &cancel_list);
+ break;
+ }
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+ list_move(&item->ri_list, &inode_buffer_list);
break;
}
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_INODE:
case XFS_LI_DQUOT:
case XFS_LI_QUOTAOFF:
@@ -1632,7 +1675,7 @@ xlog_recover_reorder_trans(
case XFS_LI_EFI:
trace_xfs_log_recover_item_reorder_tail(log,
trans, item, pass);
- list_move_tail(&item->ri_list, &trans->r_itemq);
+ list_move_tail(&item->ri_list, &inode_list);
break;
default:
xfs_warn(log->l_mp,
@@ -1643,6 +1686,14 @@ xlog_recover_reorder_trans(
}
}
ASSERT(list_empty(&sort_list));
+ if (!list_empty(&buffer_list))
+ list_splice(&buffer_list, &trans->r_itemq);
+ if (!list_empty(&inode_list))
+ list_splice_tail(&inode_list, &trans->r_itemq);
+ if (!list_empty(&inode_buffer_list))
+ list_splice_tail(&inode_buffer_list, &trans->r_itemq);
+ if (!list_empty(&cancel_list))
+ list_splice_tail(&cancel_list, &trans->r_itemq);
return 0;
}
@@ -1794,7 +1845,13 @@ xlog_recover_do_inode_buffer(
xfs_agino_t *buffer_nextp;
trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
- bp->b_ops = &xfs_inode_buf_ops;
+
+ /*
+ * Post recovery validation only works properly on CRC enabled
+ * filesystems.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ bp->b_ops = &xfs_inode_buf_ops;
inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
@@ -1861,6 +1918,15 @@ xlog_recover_do_inode_buffer(
buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
next_unlinked_offset);
*buffer_nextp = *logged_nextp;
+
+ /*
+ * If necessary, recalculate the CRC in the on-disk inode. We
+ * have to leave the inode in a consistent state for whoever
+ * reads it next....
+ */
+ xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+ xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
}
return 0;
@@ -2097,6 +2163,17 @@ xlog_recover_do_reg_buffer(
((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
/*
+ * The dirty regions logged in the buffer, even though
+ * contiguous, may span multiple chunks. This is because the
+ * dirty region may span a physical page boundary in a buffer
+ * and hence be split into two separate vectors for writing into
+ * the log. Hence we need to trim nbits back to the length of
+ * the current region being copied out of the log.
+ */
+ if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+
+ /*
* Do a sanity check if this is a dquot buffer. Just checking
* the first dquot in the buffer should do. XXXThis is
* probably a good thing to do for other buf types also.
@@ -2134,7 +2211,16 @@ xlog_recover_do_reg_buffer(
/* Shouldn't be any more regions */
ASSERT(i == item->ri_total);
- xlog_recovery_validate_buf_type(mp, bp, buf_f);
+ /*
+ * We can only do post recovery validation on items on CRC enabled
+ * fielsystems as we need to know when the buffer was written to be able
+ * to determine if we should have replayed the item. If we replay old
+ * metadata over a newer buffer, then it will enter a temporarily
+ * inconsistent state resulting in verification failures. Hence for now
+ * just avoid the verification stage for non-crc filesystems
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xlog_recovery_validate_buf_type(mp, bp, buf_f);
}
/*
@@ -2255,6 +2341,12 @@ xfs_qm_dqcheck(
d->dd_diskdq.d_flags = type;
d->dd_diskdq.d_id = cpu_to_be32(id);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
return errs;
}
@@ -2782,6 +2874,10 @@ xlog_recover_dquot_pass2(
}
memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_target->bt_mount == mp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f6bfbd734669..e8e310c05097 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -314,7 +314,8 @@ STATIC int
xfs_mount_validate_sb(
xfs_mount_t *mp,
xfs_sb_t *sbp,
- bool check_inprogress)
+ bool check_inprogress,
+ bool check_version)
{
/*
@@ -337,9 +338,10 @@ xfs_mount_validate_sb(
/*
* Version 5 superblock feature mask validation. Reject combinations the
- * kernel cannot support up front before checking anything else.
+ * kernel cannot support up front before checking anything else. For
+ * write validation, we don't need to check feature masks.
*/
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
xfs_alert(mp,
"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
"Use of these features in this kernel is at your own risk!");
@@ -675,7 +677,8 @@ xfs_sb_to_disk(
static int
xfs_sb_verify(
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ bool check_version)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_sb sb;
@@ -686,7 +689,8 @@ xfs_sb_verify(
* Only check the in progress field for the primary superblock as
* mkfs.xfs doesn't clear it from secondary superblocks.
*/
- return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
+ return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+ check_version);
}
/*
@@ -719,7 +723,7 @@ xfs_sb_read_verify(
goto out_error;
}
}
- error = xfs_sb_verify(bp);
+ error = xfs_sb_verify(bp, true);
out_error:
if (error) {
@@ -758,7 +762,7 @@ xfs_sb_write_verify(
struct xfs_buf_log_item *bip = bp->b_fspriv;
int error;
- error = xfs_sb_verify(bp);
+ error = xfs_sb_verify(bp, false);
if (error) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, error);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f41702b43003..b75c9bb6e71e 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -41,6 +41,7 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_cksum.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -839,7 +840,7 @@ xfs_qm_reset_dqcounts(
xfs_dqid_t id,
uint type)
{
- xfs_disk_dquot_t *ddq;
+ struct xfs_dqblk *dqb;
int j;
trace_xfs_reset_dqcounts(bp, _RET_IP_);
@@ -853,8 +854,12 @@ xfs_qm_reset_dqcounts(
do_div(j, sizeof(xfs_dqblk_t));
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
- ddq = bp->b_addr;
+ dqb = bp->b_addr;
for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
+ struct xfs_disk_dquot *ddq;
+
+ ddq = (struct xfs_disk_dquot *)&dqb[j];
+
/*
* Do a sanity check, and if needed, repair the dqblk. Don't
* output any warnings because it's perfectly possible to
@@ -871,7 +876,12 @@ xfs_qm_reset_dqcounts(
ddq->d_bwarns = 0;
ddq->d_iwarns = 0;
ddq->d_rtbwarns = 0;
- ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_update_cksum((char *)&dqb[j],
+ sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
}
}
@@ -907,19 +917,29 @@ xfs_qm_dqiter_bufs(
XFS_FSB_TO_DADDR(mp, bno),
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
- if (error)
- break;
/*
- * XXX(hch): need to figure out if it makes sense to validate
- * the CRC here.
+ * CRC and validation errors will return a EFSCORRUPTED here. If
+ * this occurs, re-read without CRC validation so that we can
+ * repair the damage via xfs_qm_reset_dqcounts(). This process
+ * will leave a trace in the log indicating corruption has
+ * been detected.
*/
+ if (error == EFSCORRUPTED) {
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, bno),
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ NULL);
+ }
+
+ if (error)
+ break;
+
xfs_qm_reset_dqcounts(mp, bp, firstid, type);
xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
- /*
- * goto the next block.
- */
+
+ /* goto the next block. */
bno++;
firstid += mp->m_quotainfo->qi_dqperchunk;
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index c41190cad6e9..6cdf6ffc36a1 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -489,31 +489,36 @@ xfs_qm_scall_setqlim(
if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
return 0;
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
- error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
- 0, 0, XFS_DEFAULT_LOG_COUNT);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return (error);
- }
-
/*
* We don't want to race with a quotaoff so take the quotaoff lock.
- * (We don't hold an inode lock, so there's nothing else to stop
- * a quotaoff from happening). (XXXThis doesn't currently happen
- * because we take the vfslock before calling xfs_qm_sysent).
+ * We don't hold an inode lock, so there's nothing else to stop
+ * a quotaoff from happening.
*/
mutex_lock(&q->qi_quotaofflock);
/*
- * Get the dquot (locked), and join it to the transaction.
- * Allocate the dquot if this doesn't exist.
+ * Get the dquot (locked) before we start, as we need to do a
+ * transaction to allocate it if it doesn't exist. Once we have the
+ * dquot, unlock it so we can start the next transaction safely. We hold
+ * a reference to the dquot, so it's safe to do this unlock/lock without
+ * it being reclaimed in the mean time.
*/
- if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
+ if (error) {
ASSERT(error != ENOENT);
goto out_unlock;
}
+ xfs_dqunlock(dqp);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+ error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
+ 0, 0, XFS_DEFAULT_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_rele;
+ }
+
+ xfs_dqlock(dqp);
xfs_trans_dqjoin(tp, dqp);
ddq = &dqp->q_core;
@@ -621,9 +626,10 @@ xfs_qm_scall_setqlim(
xfs_trans_log_dquot(tp, dqp);
error = xfs_trans_commit(tp, 0);
- xfs_qm_dqrele(dqp);
- out_unlock:
+out_rele:
+ xfs_qm_dqrele(dqp);
+out_unlock:
mutex_unlock(&q->qi_quotaofflock);
return error;
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c61e31c7d997..c38068f26c55 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -87,6 +87,8 @@ typedef struct xfs_dqblk {
uuid_t dd_uuid; /* location information */
} xfs_dqblk_t;
+#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
+
/*
* flags for q_flags field in the dquot.
*/
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ea341cea68cb..3033ba5e9762 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1373,6 +1373,17 @@ xfs_finish_flags(
}
/*
+ * V5 filesystems always use attr2 format for attributes.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+ xfs_warn(mp,
+"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
+ MNTOPT_NOATTR2, MNTOPT_ATTR2);
+ return XFS_ERROR(EINVAL);
+ }
+
+ /*
* mkfs'ed attr2 will turn on attr2 mount unless explicitly
* told by noattr2 to turn it off
*/
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 5f234389327c..195a403e1522 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -56,16 +56,9 @@ xfs_symlink_blocks(
struct xfs_mount *mp,
int pathlen)
{
- int fsblocks = 0;
- int len = pathlen;
+ int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
- do {
- fsblocks++;
- len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
- } while (len > 0);
-
- ASSERT(fsblocks <= XFS_SYMLINK_MAPS);
- return fsblocks;
+ return (pathlen + buflen - 1) / buflen;
}
static int
@@ -405,7 +398,7 @@ xfs_symlink(
if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
fs_blocks = 0;
else
- fs_blocks = XFS_B_TO_FSB(mp, pathlen);
+ fs_blocks = xfs_symlink_blocks(mp, pathlen);
resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
@@ -512,7 +505,7 @@ xfs_symlink(
cur_chunk = target_path;
offset = 0;
for (n = 0; n < nmaps; n++) {
- char *buf;
+ char *buf;
d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
@@ -525,9 +518,7 @@ xfs_symlink(
bp->b_ops = &xfs_symlink_buf_ops;
byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
- if (pathlen < byte_cnt) {
- byte_cnt = pathlen;
- }
+ byte_cnt = min(byte_cnt, pathlen);
buf = bp->b_addr;
buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
@@ -542,6 +533,7 @@ xfs_symlink(
xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
(char *)bp->b_addr);
}
+ ASSERT(pathlen == 0);
}
/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aa4db3307d36..a04701de6bbd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -974,14 +974,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read);
DEFINE_RW_EVENT(xfs_file_splice_write);
DECLARE_EVENT_CLASS(xfs_page_class,
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
- TP_ARGS(inode, page, off),
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+ unsigned int len),
+ TP_ARGS(inode, page, off, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(pgoff_t, pgoff)
__field(loff_t, size)
__field(unsigned long, offset)
+ __field(unsigned int, length)
__field(int, delalloc)
__field(int, unwritten)
),
@@ -995,24 +997,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
__entry->pgoff = page_offset(page);
__entry->size = i_size_read(inode);
__entry->offset = off;
+ __entry->length = len;
__entry->delalloc = delalloc;
__entry->unwritten = unwritten;
),
TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
- "delalloc %d unwritten %d",
+ "length %x delalloc %d unwritten %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pgoff,
__entry->size,
__entry->offset,
+ __entry->length,
__entry->delalloc,
__entry->unwritten)
)
#define DEFINE_PAGE_EVENT(name) \
DEFINE_EVENT(xfs_page_class, name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
- TP_ARGS(inode, page, off))
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+ unsigned int len), \
+ TP_ARGS(inode, page, off, len))
DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 1501f4fa51a6..0176bb21f09a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1453,7 +1453,7 @@ xfs_free_file_space(
xfs_mount_t *mp;
int nimap;
uint resblks;
- uint rounding;
+ xfs_off_t rounding;
int rt;
xfs_fileoff_t startoffset_fsb;
xfs_trans_t *tp;
@@ -1482,7 +1482,7 @@ xfs_free_file_space(
inode_dio_wait(VFS_I(ip));
}
- rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+ rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
ioffset = offset & ~(rounding - 1);
error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ioffset, -1);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 5163022d9808..38c67c34d73f 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,8 +31,7 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
- xfs_off_t *offset, filldir_t filldir);
+int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, umode_t mode, struct xfs_inode **ipp);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
OpenPOWER on IntegriCloud