summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/afs/flock.c2
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c47
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/binfmt_elf.c44
-rw-r--r--fs/binfmt_elf_fdpic.c56
-rw-r--r--fs/block_dev.c3
-rw-r--r--fs/btrfs/disk-io.c3
-rw-r--r--fs/btrfs/inode.c28
-rw-r--r--fs/btrfs/ordered-data.c1
-rw-r--r--fs/btrfs/super.c4
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifsfs.c4
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/dlm/lowcomms.c26
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/mmap.c2
-rw-r--r--fs/exec.c6
-rw-r--r--fs/ext2/xip.c2
-rw-r--r--fs/ext3/fsync.c12
-rw-r--r--fs/ext3/inode.c28
-rw-r--r--fs/ext3/super.c4
-rw-r--r--fs/ext4/Kconfig11
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_extents.h4
-rw-r--r--fs/ext4/ext4_jbd2.c9
-rw-r--r--fs/ext4/extents.c112
-rw-r--r--fs/ext4/fsync.c13
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c150
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c429
-rw-r--r--fs/ext4/mballoc.h22
-rw-r--r--fs/ext4/migrate.c22
-rw-r--r--fs/ext4/move_extent.c334
-rw-r--r--fs/ext4/namei.c22
-rw-r--r--fs/ext4/resize.c7
-rw-r--r--fs/ext4/super.c159
-rw-r--r--fs/ext4/xattr.c15
-rw-r--r--fs/fs-writeback.c345
-rw-r--r--fs/fuse/control.c138
-rw-r--r--fs/fuse/dev.c10
-rw-r--r--fs/fuse/fuse_i.h18
-rw-r--r--fs/fuse/inode.c82
-rw-r--r--fs/hugetlbfs/inode.c21
-rw-r--r--fs/inode.c20
-rw-r--r--fs/jbd/checkpoint.c6
-rw-r--r--fs/jbd/commit.c2
-rw-r--r--fs/jbd/journal.c30
-rw-r--r--fs/jbd/recovery.c18
-rw-r--r--fs/jbd/revoke.c16
-rw-r--r--fs/jbd/transaction.c9
-rw-r--r--fs/jbd2/commit.c12
-rw-r--r--fs/jbd2/journal.c6
-rw-r--r--fs/jbd2/transaction.c7
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/lockd/clntproc.c2
-rw-r--r--fs/lockd/svclock.c2
-rw-r--r--fs/locks.c2
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/write.c1
-rw-r--r--fs/nfsd/nfs4state.c4
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/gcinode.c2
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/nilfs2/mdt.c4
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h10
-rw-r--r--fs/nilfs2/super.c4
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/ntfs/malloc.h2
-rw-r--r--fs/ocfs2/quota.h2
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/omfs/dir.c2
-rw-r--r--fs/omfs/file.c4
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/omfs/omfs.h6
-rw-r--r--fs/partitions/check.c4
-rw-r--r--fs/proc/base.c44
-rw-r--r--fs/proc/kcore.c35
-rw-r--r--fs/proc/meminfo.c4
-rw-r--r--fs/proc/page.c5
-rw-r--r--fs/proc/task_mmu.c28
-rw-r--r--fs/quota/dquot.c4
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/squashfs/super.c4
-rw-r--r--fs/super.c8
-rw-r--r--fs/sync.c9
-rw-r--r--fs/ubifs/budget.c22
-rw-r--r--fs/ubifs/commit.c2
-rw-r--r--fs/ubifs/debug.c112
-rw-r--r--fs/ubifs/debug.h5
-rw-r--r--fs/ubifs/file.c62
-rw-r--r--fs/ubifs/gc.c2
-rw-r--r--fs/ubifs/io.c29
-rw-r--r--fs/ubifs/journal.c13
-rw-r--r--fs/ubifs/key.h35
-rw-r--r--fs/ubifs/log.c17
-rw-r--r--fs/ubifs/lprops.c43
-rw-r--r--fs/ubifs/master.c20
-rw-r--r--fs/ubifs/orphan.c7
-rw-r--r--fs/ubifs/recovery.c4
-rw-r--r--fs/ubifs/replay.c6
-rw-r--r--fs/ubifs/scan.c32
-rw-r--r--fs/ubifs/super.c34
-rw-r--r--fs/ubifs/tnc.c76
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs-media.h7
-rw-r--r--fs/ubifs/ubifs.h13
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c51
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c28
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h1
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c78
-rw-r--r--fs/xfs/xfs_ag.h9
-rw-r--r--fs/xfs/xfs_bmap.c2
-rw-r--r--fs/xfs/xfs_bmap.h11
-rw-r--r--fs/xfs/xfs_bmap_btree.c20
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c42
-rw-r--r--fs/xfs/xfs_btree.h15
-rw-r--r--fs/xfs/xfs_ialloc.c805
-rw-r--r--fs/xfs/xfs_ialloc.h18
-rw-r--r--fs/xfs/xfs_iget.c27
-rw-r--r--fs/xfs/xfs_inode.c8
-rw-r--r--fs/xfs/xfs_inode.h8
-rw-r--r--fs/xfs/xfs_inode_item.c10
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_inum.h1
-rw-r--r--fs/xfs/xfs_itable.c98
-rw-r--r--fs/xfs/xfs_itable.h5
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_mru_cache.c29
-rw-r--r--fs/xfs/xfs_mru_cache.h1
-rw-r--r--fs/xfs/xfs_rw.c84
-rw-r--r--fs/xfs/xfs_rw.h7
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_buf.c4
-rw-r--r--fs/xfs/xfs_trans_inode.c86
-rw-r--r--fs/xfs/xfs_vnodeops.c17
157 files changed, 2540 insertions, 2207 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 455aa207e67e..d4bf8caad8d0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -109,6 +109,7 @@ source "fs/sysfs/Kconfig"
config TMPFS
bool "Virtual memory file system support (former shm fs)"
+ depends on SHMEM
help
Tmpfs is a file system which keeps all files in virtual memory.
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 3ff8bdd18fb3..0931bc1325eb 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -21,7 +21,7 @@ static void afs_fl_release_private(struct file_lock *fl);
static struct workqueue_struct *afs_lock_manager;
static DEFINE_MUTEX(afs_lock_manager_mutex);
-static struct file_lock_operations afs_lock_ops = {
+static const struct file_lock_operations afs_lock_ops = {
.fl_copy_lock = afs_fl_copy_lock,
.fl_release_private = afs_fl_release_private,
};
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
.bdi = mapping->backing_dev_info,
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
- .for_writepages = 1,
.range_cyclic = 1,
};
int ret;
diff --git a/fs/aio.c b/fs/aio.c
index d065b2c3273e..fc21c23b2387 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -24,6 +24,7 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/mmu_context.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
@@ -34,7 +35,6 @@
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
-#include <asm/mmu_context.h>
#if DEBUG > 1
#define dprintk printk
@@ -595,51 +595,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
}
/*
- * use_mm
- * Makes the calling kernel thread take on the specified
- * mm context.
- * Called by the retry thread execute retries within the
- * iocb issuer's mm context, so that copy_from/to_user
- * operations work seamlessly for aio.
- * (Note: this routine is intended to be called only
- * from a kernel thread context)
- */
-static void use_mm(struct mm_struct *mm)
-{
- struct mm_struct *active_mm;
- struct task_struct *tsk = current;
-
- task_lock(tsk);
- active_mm = tsk->active_mm;
- atomic_inc(&mm->mm_count);
- tsk->mm = mm;
- tsk->active_mm = mm;
- switch_mm(active_mm, mm, tsk);
- task_unlock(tsk);
-
- mmdrop(active_mm);
-}
-
-/*
- * unuse_mm
- * Reverses the effect of use_mm, i.e. releases the
- * specified mm context which was earlier taken on
- * by the calling kernel thread
- * (Note: this routine is intended to be called only
- * from a kernel thread context)
- */
-static void unuse_mm(struct mm_struct *mm)
-{
- struct task_struct *tsk = current;
-
- task_lock(tsk);
- tsk->mm = NULL;
- /* active_mm is still 'mm' */
- enter_lazy_tlb(mm, tsk);
- task_unlock(tsk);
-}
-
-/*
* Queue up a kiocb to be retried. Assumes that the kiocb
* has already been marked as kicked, and places it on
* the retry run list for the corresponding ioctx, if it
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 615d5496fe0f..dd376c124e71 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -842,7 +842,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = BEFS_SUPER_MAGIC;
/* Set real blocksize of fs */
sb_set_blocksize(sb, (ulong) befs_sb->block_size);
- sb->s_op = (struct super_operations *) &befs_sops;
+ sb->s_op = &befs_sops;
root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
if (IS_ERR(root)) {
ret = PTR_ERR(root);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7c1e65d54872..442d94fe255c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1280,9 +1280,6 @@ static int writenote(struct memelfnote *men, struct file *file,
#define DUMP_WRITE(addr, nr) \
if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
goto end_coredump;
-#define DUMP_SEEK(off) \
- if (!dump_seek(file, (off))) \
- goto end_coredump;
static void fill_elf_header(struct elfhdr *elf, int segs,
u16 machine, u32 flags, u8 osabi)
@@ -2016,7 +2013,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
goto end_coredump;
/* Align to page */
- DUMP_SEEK(dataoff - foffset);
+ if (!dump_seek(file, dataoff - foffset))
+ goto end_coredump;
for (vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma)) {
@@ -2027,33 +2025,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
struct page *page;
- struct vm_area_struct *tmp_vma;
-
- if (get_user_pages(current, current->mm, addr, 1, 0, 1,
- &page, &tmp_vma) <= 0) {
- DUMP_SEEK(PAGE_SIZE);
- } else {
- if (page == ZERO_PAGE(0)) {
- if (!dump_seek(file, PAGE_SIZE)) {
- page_cache_release(page);
- goto end_coredump;
- }
- } else {
- void *kaddr;
- flush_cache_page(tmp_vma, addr,
- page_to_pfn(page));
- kaddr = kmap(page);
- if ((size += PAGE_SIZE) > limit ||
- !dump_write(file, kaddr,
- PAGE_SIZE)) {
- kunmap(page);
- page_cache_release(page);
- goto end_coredump;
- }
- kunmap(page);
- }
+ int stop;
+
+ page = get_dump_page(addr);
+ if (page) {
+ void *kaddr = kmap(page);
+ stop = ((size += PAGE_SIZE) > limit) ||
+ !dump_write(file, kaddr, PAGE_SIZE);
+ kunmap(page);
page_cache_release(page);
- }
+ } else
+ stop = !dump_seek(file, PAGE_SIZE);
+ if (stop)
+ goto end_coredump;
}
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 20fbeced472b..76285471073e 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1325,9 +1325,6 @@ static int writenote(struct memelfnote *men, struct file *file)
#define DUMP_WRITE(addr, nr) \
if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
goto end_coredump;
-#define DUMP_SEEK(off) \
- if (!dump_seek(file, (off))) \
- goto end_coredump;
static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
{
@@ -1518,6 +1515,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
unsigned long *limit, unsigned long mm_flags)
{
struct vm_area_struct *vma;
+ int err = 0;
for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
unsigned long addr;
@@ -1525,43 +1523,26 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
if (!maydump(vma, mm_flags))
continue;
- for (addr = vma->vm_start;
- addr < vma->vm_end;
- addr += PAGE_SIZE
- ) {
- struct vm_area_struct *vma;
- struct page *page;
-
- if (get_user_pages(current, current->mm, addr, 1, 0, 1,
- &page, &vma) <= 0) {
- DUMP_SEEK(file->f_pos + PAGE_SIZE);
- }
- else if (page == ZERO_PAGE(0)) {
- page_cache_release(page);
- DUMP_SEEK(file->f_pos + PAGE_SIZE);
- }
- else {
- void *kaddr;
-
- flush_cache_page(vma, addr, page_to_pfn(page));
- kaddr = kmap(page);
- if ((*size += PAGE_SIZE) > *limit ||
- !dump_write(file, kaddr, PAGE_SIZE)
- ) {
- kunmap(page);
- page_cache_release(page);
- return -EIO;
- }
+ for (addr = vma->vm_start; addr < vma->vm_end;
+ addr += PAGE_SIZE) {
+ struct page *page = get_dump_page(addr);
+ if (page) {
+ void *kaddr = kmap(page);
+ *size += PAGE_SIZE;
+ if (*size > *limit)
+ err = -EFBIG;
+ else if (!dump_write(file, kaddr, PAGE_SIZE))
+ err = -EIO;
kunmap(page);
page_cache_release(page);
- }
+ } else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
+ err = -EFBIG;
+ if (err)
+ goto out;
}
}
-
- return 0;
-
-end_coredump:
- return -EFBIG;
+out:
+ return err;
}
#endif
@@ -1802,7 +1783,8 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
goto end_coredump;
}
- DUMP_SEEK(dataoff);
+ if (!dump_seek(file, dataoff))
+ goto end_coredump;
if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0)
goto end_coredump;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3581a4e53942..5d1ed50bd46c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
{
struct bdev_inode *bdi = BDEV_I(inode);
- bdi->bdev.bd_inode_backing_dev_info = NULL;
kmem_cache_free(bdev_cachep, bdi);
}
@@ -1115,7 +1114,7 @@ EXPORT_SYMBOL(revalidate_disk);
int check_disk_change(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
- struct block_device_operations * bdops = disk->fops;
+ const struct block_device_operations *bdops = disk->fops;
if (!bdops->media_changed)
return 0;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 15831d5c7367..6c4173146bb7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -772,7 +772,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
}
}
-static struct address_space_operations btree_aops = {
+static const struct address_space_operations btree_aops = {
.readpage = btree_readpage,
.writepage = btree_writepage,
.writepages = btree_writepages,
@@ -1600,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
+ sb->s_bdi = &fs_info->bdi;
/*
* we set the i_size on the btree inode to the max possible int.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59cba180fe83..9096fd0ca3ca 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -55,13 +55,13 @@ struct btrfs_iget_args {
struct btrfs_root *root;
};
-static struct inode_operations btrfs_dir_inode_operations;
-static struct inode_operations btrfs_symlink_inode_operations;
-static struct inode_operations btrfs_dir_ro_inode_operations;
-static struct inode_operations btrfs_special_inode_operations;
-static struct inode_operations btrfs_file_inode_operations;
-static struct address_space_operations btrfs_aops;
-static struct address_space_operations btrfs_symlink_aops;
+static const struct inode_operations btrfs_dir_inode_operations;
+static const struct inode_operations btrfs_symlink_inode_operations;
+static const struct inode_operations btrfs_dir_ro_inode_operations;
+static const struct inode_operations btrfs_special_inode_operations;
+static const struct inode_operations btrfs_file_inode_operations;
+static const struct address_space_operations btrfs_aops;
+static const struct address_space_operations btrfs_symlink_aops;
static struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;
@@ -5201,7 +5201,7 @@ static int btrfs_permission(struct inode *inode, int mask)
return generic_permission(inode, mask, btrfs_check_acl);
}
-static struct inode_operations btrfs_dir_inode_operations = {
+static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
.create = btrfs_create,
@@ -5219,7 +5219,7 @@ static struct inode_operations btrfs_dir_inode_operations = {
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
};
-static struct inode_operations btrfs_dir_ro_inode_operations = {
+static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
};
@@ -5259,7 +5259,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
*
* For now we're avoiding this by dropping bmap.
*/
-static struct address_space_operations btrfs_aops = {
+static const struct address_space_operations btrfs_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
@@ -5271,14 +5271,14 @@ static struct address_space_operations btrfs_aops = {
.set_page_dirty = btrfs_set_page_dirty,
};
-static struct address_space_operations btrfs_symlink_aops = {
+static const struct address_space_operations btrfs_symlink_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
};
-static struct inode_operations btrfs_file_inode_operations = {
+static const struct inode_operations btrfs_file_inode_operations = {
.truncate = btrfs_truncate,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
@@ -5290,7 +5290,7 @@ static struct inode_operations btrfs_file_inode_operations = {
.fallocate = btrfs_fallocate,
.fiemap = btrfs_fiemap,
};
-static struct inode_operations btrfs_special_inode_operations = {
+static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
@@ -5299,7 +5299,7 @@ static struct inode_operations btrfs_special_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
};
-static struct inode_operations btrfs_symlink_inode_operations = {
+static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..7b2f401e604e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
.nr_to_write = mapping->nrpages * 2,
.range_start = start,
.range_end = end,
- .for_writepages = 1,
};
return btrfs_writepages(mapping, &wbc);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6d6d06cb6dfc..2db17cd66fc5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,7 @@
#include "export.h"
#include "compression.h"
-static struct super_operations btrfs_super_ops;
+static const struct super_operations btrfs_super_ops;
static void btrfs_put_super(struct super_block *sb)
{
@@ -675,7 +675,7 @@ static int btrfs_unfreeze(struct super_block *sb)
return 0;
}
-static struct super_operations btrfs_super_ops = {
+static const struct super_operations btrfs_super_ops = {
.delete_inode = btrfs_delete_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 606912d8f2a8..6889c0d07b82 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -385,7 +385,7 @@ out_err:
goto out;
}
-struct inode_operations cifs_dfs_referral_inode_operations = {
+const struct inode_operations cifs_dfs_referral_inode_operations = {
.follow_link = cifs_dfs_follow_mountpoint,
};
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3610e9958b4c..d79ce2e95c23 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -50,7 +50,7 @@
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
#ifdef CONFIG_CIFS_QUOTA
-static struct quotactl_ops cifs_quotactl_ops;
+static const struct quotactl_ops cifs_quotactl_ops;
#endif /* QUOTA */
int cifsFYI = 0;
@@ -517,7 +517,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
return rc;
}
-static struct quotactl_ops cifs_quotactl_ops = {
+static const struct quotactl_ops cifs_quotactl_ops = {
.set_xquota = cifs_xquota_set,
.get_xquota = cifs_xquota_get,
.set_xstate = cifs_xstate_set,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 094325e3f714..ac2b24c192f8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -67,7 +67,7 @@ extern int cifs_setattr(struct dentry *, struct iattr *);
extern const struct inode_operations cifs_file_inode_ops;
extern const struct inode_operations cifs_symlink_inode_ops;
-extern struct inode_operations cifs_dfs_referral_inode_operations;
+extern const struct inode_operations cifs_dfs_referral_inode_operations;
/* Functions related to files and directories */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f03886..240cef14fe58 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
#define CF_CONNECT_PENDING 3
#define CF_INIT_PENDING 4
#define CF_IS_OTHERCON 5
+#define CF_CLOSE 6
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
int (*rx_action) (struct connection *); /* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
static inline void lowcomms_connect_sock(struct connection *con)
{
+ if (test_bit(CF_CLOSE, &con->flags))
+ return;
if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -926,10 +929,8 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- if (dlm_nodeid_to_addr(con->nodeid, &saddr)) {
- sock_release(sock);
+ if (dlm_nodeid_to_addr(con->nodeid, &saddr))
goto out_err;
- }
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
@@ -1284,7 +1285,6 @@ out:
static void send_to_sock(struct connection *con)
{
int ret = 0;
- ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
int len, offset;
@@ -1293,8 +1293,6 @@ static void send_to_sock(struct connection *con)
if (con->sock == NULL)
goto out_connect;
- sendpage = con->sock->ops->sendpage;
-
spin_lock(&con->writequeue_lock);
for (;;) {
e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1309,8 +1307,8 @@ static void send_to_sock(struct connection *con)
ret = 0;
if (len) {
- ret = sendpage(con->sock, e->page, offset, len,
- msg_flags);
+ ret = kernel_sendpage(con->sock, e->page, offset, len,
+ msg_flags);
if (ret == -EAGAIN || ret == 0) {
cond_resched();
goto out;
@@ -1370,6 +1368,13 @@ int dlm_lowcomms_close(int nodeid)
log_print("closing connection to node %d", nodeid);
con = nodeid2con(nodeid, 0);
if (con) {
+ clear_bit(CF_CONNECT_PENDING, &con->flags);
+ clear_bit(CF_WRITE_PENDING, &con->flags);
+ set_bit(CF_CLOSE, &con->flags);
+ if (cancel_work_sync(&con->swork))
+ log_print("canceled swork for node %d", nodeid);
+ if (cancel_work_sync(&con->rwork))
+ log_print("canceled rwork for node %d", nodeid);
clean_one_writequeue(con);
close_connection(con, true);
}
@@ -1395,9 +1400,10 @@ static void process_send_sockets(struct work_struct *work)
if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
con->connect_action(con);
+ set_bit(CF_WRITE_PENDING, &con->flags);
}
- clear_bit(CF_WRITE_PENDING, &con->flags);
- send_to_sock(con);
+ if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
+ send_to_sock(con);
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 00b30a2d5466..542f625312f3 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -582,7 +582,7 @@ extern const struct inode_operations ecryptfs_dir_iops;
extern const struct inode_operations ecryptfs_symlink_iops;
extern const struct super_operations ecryptfs_sops;
extern const struct dentry_operations ecryptfs_dops;
-extern struct address_space_operations ecryptfs_aops;
+extern const struct address_space_operations ecryptfs_aops;
extern int ecryptfs_verbosity;
extern unsigned int ecryptfs_message_buf_len;
extern signed long ecryptfs_message_wait_timeout;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 5c6bab9786e3..05772aeaa8f4 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -545,7 +545,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
return rc;
}
-struct address_space_operations ecryptfs_aops = {
+const struct address_space_operations ecryptfs_aops = {
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
.write_begin = ecryptfs_write_begin,
diff --git a/fs/exec.c b/fs/exec.c
index 172ceb6edde4..434dba778ccc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,7 +33,7 @@
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
@@ -923,7 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
task_lock(tsk);
strlcpy(tsk->comm, buf, sizeof(tsk->comm));
task_unlock(tsk);
- perf_counter_comm(tsk);
+ perf_event_comm(tsk);
}
int flush_old_exec(struct linux_binprm * bprm)
@@ -997,7 +997,7 @@ int flush_old_exec(struct linux_binprm * bprm)
* security domain:
*/
if (!get_dumpable(current->mm))
- perf_counter_exit_task(current);
+ perf_event_exit_task(current);
/* An exec changes our domain. We are no longer part of the thread
group */
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index b72b85884223..c18fbf3e4068 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -20,7 +20,7 @@ __inode_direct_access(struct inode *inode, sector_t block,
void **kaddr, unsigned long *pfn)
{
struct block_device *bdev = inode->i_sb->s_bdev;
- struct block_device_operations *ops = bdev->bd_disk->fops;
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
sector_t sector;
sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e17..451d166bbe93 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
*/
#include <linux/time.h>
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
}
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
+ goto flush;
/*
* The VFS has written the file data. If the inode is unaltered
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ goto out;
}
+flush:
+ /*
+ * In case we didn't commit a transaction, we have to flush
+ * disk caches manually so that data really is on persistent
+ * storage
+ */
+ if (test_opt(inode->i_sb, BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
out:
return ret;
}
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b49908a167ae..cd098a7b77fc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
{
+ int ret;
+
jbd_debug(2, "restarting handle %p\n", handle);
- return ext3_journal_restart(handle, blocks_for_truncate(inode));
+ /*
+ * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
+ * At this moment, get_block can be called only for blocks inside
+ * i_size since page cache has been already dropped and writes are
+ * blocked by i_mutex. So we can safely drop the truncate_mutex.
+ */
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+ ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
+ return ret;
}
/*
@@ -2072,7 +2083,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
ext3_journal_dirty_metadata(handle, bh);
}
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext3_journal_get_write_access(handle, bh);
@@ -2282,7 +2293,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
}
ext3_free_blocks(handle, inode, nr, 1);
@@ -2892,6 +2903,10 @@ static int ext3_do_update_inode(handle_t *handle,
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+again:
+ /* we can't allow multiple procs in here at once, its a bit racey */
+ lock_buffer(bh);
+
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
if (ei->i_state & EXT3_STATE_NEW)
@@ -2951,16 +2966,20 @@ static int ext3_do_update_inode(handle_t *handle,
/* If this is the first large file
* created, add a flag to the superblock.
*/
+ unlock_buffer(bh);
err = ext3_journal_get_write_access(handle,
EXT3_SB(sb)->s_sbh);
if (err)
goto out_brelse;
+
ext3_update_dynamic_rev(sb);
EXT3_SET_RO_COMPAT_FEATURE(sb,
EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
handle->h_sync = 1;
err = ext3_journal_dirty_metadata(handle,
EXT3_SB(sb)->s_sbh);
+ /* get our lock and start over */
+ goto again;
}
}
}
@@ -2983,6 +3002,7 @@ static int ext3_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ unlock_buffer(bh);
rc = ext3_journal_dirty_metadata(handle, bh);
if (!err)
err = rc;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a8d80a7f1105..72743d360509 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -720,7 +720,7 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
static ssize_t ext3_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
-static struct dquot_operations ext3_quota_operations = {
+static const struct dquot_operations ext3_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -737,7 +737,7 @@ static struct dquot_operations ext3_quota_operations = {
.destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops ext3_qctl_operations = {
+static const struct quotactl_ops ext3_qctl_operations = {
.quota_on = ext3_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..d5c0ea2e8f2d 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT
To enable backwards compatibility so that systems that are
still expecting to mount ext4 filesystems using ext4dev,
- chose Y here. This feature will go away by 2.6.31, so
+ choose Y here. This feature will go away by 2.6.31, so
please arrange to get your userspace programs fixed!
config EXT4_FS_XATTR
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+ bool "EXT4 debugging support"
+ depends on EXT4_FS
+ help
+ Enables run-time debugging support for the ext4 filesystem.
+
+ If you select Y here, then you will be able to turn on debugging
+ with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..1d0418980f8d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
* new bitmap information
*/
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- ext4_mb_update_group_info(grp, blocks_freed);
+ grp->bb_free += blocks_freed;
up_write(&grp->alloc_sem);
/* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..e227eea23f05 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t;
/* prefer goal again. length */
-#define EXT4_MB_HINT_MERGE 1
+#define EXT4_MB_HINT_MERGE 0x0001
/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED 2
+#define EXT4_MB_HINT_RESERVED 0x0002
/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA 4
+#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
-#define EXT4_MB_HINT_FIRST 8
+#define EXT4_MB_HINT_FIRST 0x0008
/* search for the best chunk */
-#define EXT4_MB_HINT_BEST 16
+#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
-#define EXT4_MB_HINT_DATA 32
+#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
-#define EXT4_MB_HINT_NOPREALLOC 64
+#define EXT4_MB_HINT_NOPREALLOC 0x0040
/* allocate for locality group */
-#define EXT4_MB_HINT_GROUP_ALLOC 128
+#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
/* allocate goal blocks or none */
-#define EXT4_MB_HINT_GOAL_ONLY 256
+#define EXT4_MB_HINT_GOAL_ONLY 0x0100
/* goal is meaningful */
-#define EXT4_MB_HINT_TRY_GOAL 512
+#define EXT4_MB_HINT_TRY_GOAL 0x0200
/* blocks already pre-reserved by delayed allocation */
-#define EXT4_MB_DELALLOC_RESERVED 1024
+#define EXT4_MB_DELALLOC_RESERVED 0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC 0x0800
struct ext4_allocation_request {
@@ -112,6 +114,21 @@ struct ext4_allocation_request {
};
/*
+ * For delayed allocation tracking
+ */
+struct mpage_da_data {
+ struct inode *inode;
+ sector_t b_blocknr; /* start block number of extent */
+ size_t b_size; /* size of extent */
+ unsigned long b_state; /* state of the extent */
+ unsigned long first_page, next_page; /* extent of pages */
+ struct writeback_control *wbc;
+ int io_done;
+ int pages_written;
+ int retval;
+};
+
+/*
* Special inodes numbers
*/
#define EXT4_BAD_INO 1 /* Bad blocks inode */
@@ -251,7 +268,6 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
-#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
@@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
+#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -386,6 +403,9 @@ struct ext4_mount_options {
#endif
};
+/* Max physical block we can addres w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
+
/*
* Structure of an inode on the disk
*/
@@ -456,7 +476,6 @@ struct move_extent {
__u64 len; /* block length to be moved */
__u64 moved_len; /* moved block length */
};
-#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -694,7 +713,6 @@ struct ext4_inode_info {
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
-#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
@@ -841,6 +859,7 @@ struct ext4_sb_info {
unsigned long s_gdb_count; /* Number of group descriptor blocks */
unsigned long s_desc_per_block; /* Number of group descriptors per block */
ext4_group_t s_groups_count; /* Number of groups in the fs */
+ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
unsigned long s_overhead_last; /* Last calculated overhead */
unsigned long s_blocks_last; /* Last seen block count */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -950,6 +969,7 @@ struct ext4_sb_info {
atomic_t s_mb_lost_chunks;
atomic_t s_mb_preallocated;
atomic_t s_mb_discarded;
+ atomic_t s_lock_busy;
/* locality groups */
struct ext4_locality_group *s_locality_groups;
@@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
ext4_fsblk_t, unsigned long, int, unsigned long *);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
- ext4_grpblk_t add);
extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
ext4_group_t, int);
@@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
- unsigned short bb_first_free;
- unsigned short bb_free;
- unsigned short bb_fragments;
+ ext4_grpblk_t bb_first_free; /* first free block */
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
- unsigned short bb_counters[];
+ ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
+ * regions, index is order.
+ * bb_counters[3] = 5 means
+ * 5 free 8-block regions. */
};
#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
@@ -1591,15 +1613,42 @@ struct ext4_group_info {
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MAX_CONTENTION 8
+#define EXT4_CONTENTION_THRESHOLD 2
+
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
ext4_group_t group)
{
return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
}
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+ return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spin_lock(ext4_group_lock_ptr(sb, group));
+ spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+ if (spin_trylock(lock))
+ /*
+ * We're able to grab the lock right away, so drop the
+ * lock contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ else {
+ /*
+ * The lock is busy, so bump the contention counter,
+ * and then wait on the spin lock.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+ EXT4_MAX_CONTENTION);
+ spin_lock(lock);
+ }
}
static inline void ext4_unlock_group(struct super_block *sb,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..61652f1d15e6 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
#define CHECK_BINSEARCH__
/*
- * If EXT_DEBUG is defined you can use the 'extdebug' mount option
- * to get lots of info about what's going on.
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
*/
#define EXT_DEBUG__
#ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
#define EXT_BREAK 1
#define EXT_REPEAT 2
+/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
#define EXT_MAX_BLOCK 0xffffffff
/*
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..6a9409920dee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
handle, err);
}
else
- brelse(bh);
+ bforget(bh);
return err;
}
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
handle, err);
}
else
- brelse(bh);
+ bforget(bh);
return err;
}
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
ext4_journal_abort_handle(where, __func__, bh,
handle, err);
} else {
- mark_buffer_dirty(bh);
+ if (inode && bh)
+ mark_buffer_dirty_inode(bh, inode);
+ else
+ mark_buffer_dirty(bh);
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..7a3832577923 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static int ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+ struct inode *inode,
+ int needed)
{
int err;
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
err = ext4_journal_extend(handle, needed);
if (err <= 0)
return err;
- return ext4_journal_restart(handle, needed);
+ err = ext4_truncate_restart_trans(handle, inode, needed);
+ /*
+ * We have dropped i_data_sem so someone might have cached again
+ * an extent we are going to truncate.
+ */
+ ext4_ext_invalidate_cache(inode);
+
+ return err;
}
/*
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
return newblock;
}
-static int ext4_ext_space_block(struct inode *inode)
+static inline int ext4_ext_space_block(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 6)
- size = 6;
+ if (size > 6)
+ size = 6;
#endif
+ }
return size;
}
-static int ext4_ext_space_block_idx(struct inode *inode)
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 5)
- size = 5;
+ if (size > 5)
+ size = 5;
#endif
+ }
return size;
}
-static int ext4_ext_space_root(struct inode *inode)
+static inline int ext4_ext_space_root(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 3)
- size = 3;
+ if (size > 3)
+ size = 3;
#endif
+ }
return size;
}
-static int ext4_ext_space_root_idx(struct inode *inode)
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent_idx);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 4)
- size = 4;
+ if (size > 4)
+ size = 4;
#endif
+ }
return size;
}
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
int lcap, icap, rcap, leafs, idxs, num;
int newextents = blocks;
- rcap = ext4_ext_space_root_idx(inode);
- lcap = ext4_ext_space_block(inode);
- icap = ext4_ext_space_block_idx(inode);
+ rcap = ext4_ext_space_root_idx(inode, 0);
+ lcap = ext4_ext_space_block(inode, 0);
+ icap = ext4_ext_space_block_idx(inode, 0);
/* number of new leaf blocks needed */
num = leafs = (newextents + lcap - 1) / lcap;
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
if (depth == ext_depth(inode)) {
if (depth == 0)
- max = ext4_ext_space_root(inode);
+ max = ext4_ext_space_root(inode, 1);
else
- max = ext4_ext_space_root_idx(inode);
+ max = ext4_ext_space_root_idx(inode, 1);
} else {
if (depth == 0)
- max = ext4_ext_space_block(inode);
+ max = ext4_ext_space_block(inode, 1);
else
- max = ext4_ext_space_block_idx(inode);
+ max = ext4_ext_space_block_idx(inode, 1);
}
return max;
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
idx_pblock(path->p_idx));
} else if (path->p_ext) {
- ext_debug(" %d:%d:%llu ",
+ ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
+ ext4_ext_is_uninitialized(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext_pblock(path->p_ext));
} else
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
eh = path[depth].p_hdr;
ex = EXT_FIRST_EXTENT(eh);
+ ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
- ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug("\n");
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode,
}
path->p_ext = l - 1;
- ext_debug(" -> %d:%llu:%d ",
+ ext_debug(" -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext_pblock(path->p_ext),
+ ext4_ext_is_uninitialized(path->p_ext),
ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_depth = 0;
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
- eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+ eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
ext4_ext_invalidate_cache(inode);
return 0;
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = 0;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
ex = EXT_FIRST_EXTENT(neh);
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
path[depth].p_ext++;
while (path[depth].p_ext <=
EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:%d in new leaf %llu\n",
+ ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(path[depth].p_ext->ee_block),
ext_pblock(path[depth].p_ext),
+ ext4_ext_is_uninitialized(path[depth].p_ext),
ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = cpu_to_le16(1);
neh->eh_magic = EXT4_EXT_MAGIC;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
@@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
/* old root could have indexes or leaves
* so calculate e_max right way */
if (ext_depth(inode))
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
else
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
goto out;
curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
- curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
+ curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
curp->p_hdr->eh_entries = cpu_to_le16(1);
curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
@@ -1580,9 +1603,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
/* try to insert block into found extent and return */
if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append %d block to %d:%d (from %llu)\n",
+ ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
ext4_ext_get_actual_len(ex), ext_pblock(ex));
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -1651,9 +1676,10 @@ has_space:
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %d:%llu:%d\n",
+ ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext));
path[depth].p_ext = EXT_FIRST_EXTENT(eh);
} else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1689,11 @@ has_space:
len = EXT_MAX_EXTENT(eh) - nearex;
len = (len - 1) * sizeof(struct ext4_extent);
len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
+ ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1703,11 @@ has_space:
BUG_ON(newext->ee_block == nearex->ee_block);
len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
+ ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 1, nearex, len);
@@ -2094,7 +2122,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
else
uninitialized = 0;
- ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+ ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+ uninitialized, ex_ee_len);
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
@@ -2138,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
}
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
- err = ext4_ext_journal_restart(handle, credits);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
if (err)
goto out;
@@ -2327,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
if (err == 0) {
ext_inode_hdr(inode)->eh_depth = 0;
ext_inode_hdr(inode)->eh_max =
- cpu_to_le16(ext4_ext_space_root(inode));
+ cpu_to_le16(ext4_ext_space_root(inode, 0));
err = ext4_ext_dirty(handle, inode, path);
}
}
@@ -2743,6 +2772,7 @@ insert:
} else if (err)
goto fix_extent_len;
out:
+ ext4_ext_show_leaf(inode, path);
return err ? err : allocated;
fix_extent_len:
@@ -2786,7 +2816,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
struct ext4_allocation_request ar;
__clear_bit(BH_New, &bh_result->b_state);
- ext_debug("blocks %u/%u requested for inode %u\n",
+ ext_debug("blocks %u/%u requested for inode %lu\n",
iblock, max_blocks, inode->i_ino);
/* check in cache */
@@ -2849,7 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
newblock = iblock - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (iblock - ee_block);
- ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
+ ext_debug("%u fit into %u:%d -> %llu\n", iblock,
ee_block, ee_len, newblock);
/* Do not put uninitialized extent in the cache */
@@ -2950,7 +2980,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
- ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
+ ext_debug("allocate new block: goal %llu, found %llu/%u\n",
ar.goal, newblock, allocated);
/* try to insert new extent into found leaf and return */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..07475740b512 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret = 0;
+ int err, ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
goto out;
}
+ if (!journal)
+ ret = sync_mapping_buffers(inode->i_mapping);
+
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
goto out;
@@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0, /* sys_fsync did this */
};
- ret = sync_inode(inode, &wbc);
- if (journal && (journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ err = sync_inode(inode, &wbc);
+ if (ret == 0)
+ ret = err;
}
out:
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
- i, ext4_free_inodes_count(sb, gdp), x);
+ (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..4abd683b963d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+ int nblocks)
{
+ int ret;
+
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
- return ext4_journal_restart(handle, blocks_for_truncate(inode));
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+ return ret;
}
/*
@@ -341,9 +354,7 @@ static int ext4_block_to_path(struct inode *inode,
int n = 0;
int final = 0;
- if (i_block < 0) {
- ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
- } else if (i_block < direct_blocks) {
+ if (i_block < direct_blocks) {
offsets[n++] = i_block;
final = direct_blocks;
} else if ((i_block -= direct_blocks) < indirect_blocks) {
@@ -551,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
*
* Normally this function find the preferred place for block allocation,
* returns it.
+ * Because this is only used for non-extent files, we limit the block nr
+ * to 32 bits.
*/
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
Indirect *partial)
{
+ ext4_fsblk_t goal;
+
/*
* XXX need to get goal block from mballoc's data structures
*/
- return ext4_find_near(inode, partial);
+ goal = ext4_find_near(inode, partial);
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+ return goal;
}
/**
@@ -640,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
if (*err)
goto failed_out;
+ BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+
target -= count;
/* allocate blocks for indirect blocks */
while (index < indirect_blks && count) {
@@ -674,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
ar.flags = EXT4_MB_HINT_DATA;
current_block = ext4_mb_new_blocks(handle, &ar, err);
+ BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
if (*err && (target == blks)) {
/*
@@ -762,8 +782,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
BUFFER_TRACE(bh, "call get_create_access");
err = ext4_journal_get_create_access(handle, bh);
if (err) {
+ /* Don't brelse(bh) here; it's done in
+ * ext4_journal_forget() below */
unlock_buffer(bh);
- brelse(bh);
goto failed;
}
@@ -1109,16 +1130,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
ext4_discard_preallocations(inode);
}
-static int check_block_validity(struct inode *inode, sector_t logical,
- sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *msg,
+ sector_t logical, sector_t phys, int len)
{
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
- ext4_error(inode->i_sb, "check_block_validity",
+ ext4_error(inode->i_sb, msg,
"inode #%lu logical block %llu mapped to %llu "
"(size %d)", inode->i_ino,
(unsigned long long) logical,
(unsigned long long) phys, len);
- WARN_ON(1);
return -EIO;
}
return 0;
@@ -1170,8 +1190,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system corruption",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1235,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
}
}
@@ -1252,8 +1271,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system "
+ "corruption after allocation",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1863,18 +1883,6 @@ static void ext4_da_page_release_reservation(struct page *page,
* Delayed allocation stuff
*/
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
/*
* mpage_da_submit_io - walks through extent of pages and try to write
* them with writepage() call back
@@ -2737,6 +2745,7 @@ static int ext4_da_writepages(struct address_space *mapping,
long pages_skipped;
int range_cyclic, cycled = 1, io_done = 0;
int needed_blocks, ret = 0, nr_to_writebump = 0;
+ loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
trace_ext4_da_writepages(inode, wbc);
@@ -2850,6 +2859,7 @@ retry:
mpd.io_done = 1;
ret = MPAGE_DA_EXTENT_TAIL;
}
+ trace_ext4_da_write_pages(inode, &mpd);
wbc->nr_to_write -= mpd.pages_written;
ext4_journal_stop(handle);
@@ -2905,6 +2915,7 @@ out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
+ wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
return ret;
}
@@ -3117,6 +3128,8 @@ out:
*/
int ext4_alloc_da_blocks(struct inode *inode)
{
+ trace_ext4_alloc_da_blocks(inode);
+
if (!EXT4_I(inode)->i_reserved_data_blocks &&
!EXT4_I(inode)->i_reserved_meta_blocks)
return 0;
@@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
ext4_handle_dirty_metadata(handle, inode, bh);
}
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext4_journal_get_write_access(handle, bh);
@@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
}
ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3958,8 +3973,7 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
- if (ei->i_disksize && inode->i_size == 0 &&
- !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4533,7 +4547,8 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
static int ext4_do_update_inode(handle_t *handle,
struct inode *inode,
- struct ext4_iloc *iloc)
+ struct ext4_iloc *iloc,
+ int do_sync)
{
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4581,8 +4596,7 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_inode_blocks_set(handle, raw_inode, ei))
goto out_brelse;
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- /* clear the migrate flag in the raw_inode */
- raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags);
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_HURD))
raw_inode->i_file_acl_high =
@@ -4635,10 +4649,22 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, inode, bh);
- if (!err)
- err = rc;
+ /*
+ * If we're not using a journal and we were called from
+ * ext4_write_inode() to sync the inode (making do_sync true),
+ * we can just use sync_dirty_buffer() directly to do our dirty
+ * work. Testing s_journal here is a bit redundant but it's
+ * worth it to avoid potential future trouble.
+ */
+ if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
+ BUFFER_TRACE(bh, "call sync_dirty_buffer");
+ sync_dirty_buffer(bh);
+ } else {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ rc = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!err)
+ err = rc;
+ }
ei->i_state &= ~EXT4_STATE_NEW;
out_brelse:
@@ -4684,19 +4710,32 @@ out_brelse:
*/
int ext4_write_inode(struct inode *inode, int wait)
{
+ int err;
+
if (current->flags & PF_MEMALLOC)
return 0;
- if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
- dump_stack();
- return -EIO;
- }
+ if (EXT4_SB(inode->i_sb)->s_journal) {
+ if (ext4_journal_current_handle()) {
+ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ dump_stack();
+ return -EIO;
+ }
- if (!wait)
- return 0;
+ if (!wait)
+ return 0;
+
+ err = ext4_force_commit(inode->i_sb);
+ } else {
+ struct ext4_iloc iloc;
- return ext4_force_commit(inode->i_sb);
+ err = ext4_get_inode_loc(inode, &iloc);
+ if (err)
+ return err;
+ err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
+ inode, &iloc, wait);
+ }
+ return err;
}
/*
@@ -4990,7 +5029,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
get_bh(iloc->bh);
/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
- err = ext4_do_update_inode(handle, inode, iloc);
+ err = ext4_do_update_inode(handle, inode, iloc, 0);
put_bh(iloc->bh);
return err;
}
@@ -5281,12 +5320,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
else
len = PAGE_CACHE_SIZE;
+ lock_page(page);
+ /*
+ * return if we have all the buffers mapped. This avoid
+ * the need to call write_begin/write_end which does a
+ * journal_start/journal_stop which can block and take
+ * long time
+ */
if (page_has_buffers(page)) {
- /* return if we have all the buffers mapped */
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped))
+ ext4_bh_unmapped)) {
+ unlock_page(page);
goto out_unlock;
+ }
}
+ unlock_page(page);
/*
* OK, we need to fill the hole... Do write_begin write_end
* to do block allocation/reservation.We are not holding
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..c1cdf613e725 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
me.donor_start, me.len, &me.moved_len);
fput(donor_filp);
- if (!err)
- if (copy_to_user((struct move_extent *)arg,
- &me, sizeof(me)))
- return -EFAULT;
+ if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+ return -EFAULT;
+
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..e9c61896d605 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
*/
#include "mballoc.h"
+#include <linux/debugfs.h>
#include <trace/events/ext4.h>
/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
/* FIXME!! need more doc */
static void ext4_mb_mark_free_simple(struct super_block *sb,
- void *buddy, unsigned first, int len,
+ void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned short min;
- unsigned short max;
- unsigned short chunk;
+ ext4_grpblk_t min;
+ ext4_grpblk_t max;
+ ext4_grpblk_t chunk;
unsigned short border;
BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
- unsigned short i = 0;
- unsigned short first;
- unsigned short len;
+ ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_grpblk_t first;
+ ext4_grpblk_t len;
unsigned free = 0;
unsigned fragments = 0;
unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
char *data;
char *bitmap;
- mb_debug("init page %lu\n", page->index);
+ mb_debug(1, "init page %lu\n", page->index);
inode = page->mapping->host;
sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
set_bitmap_uptodate(bh[i]);
bh[i]->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh[i]);
- mb_debug("read bitmap for group %u\n", first_group + i);
+ mb_debug(1, "read bitmap for group %u\n", first_group + i);
}
/* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug("put buddy for group %u in page %lu/%x\n",
+ mb_debug(1, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
- sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ sizeof(*grinfo->bb_counters) *
+ (sb->s_blocksize_bits+2));
/*
* incore got set to the group block bitmap below
*/
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug("put bitmap for group %u in page %lu/%x\n",
+ mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
/* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
return err;
}
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+ int ret = 0;
+ void *bitmap;
+ int blocks_per_page;
+ int block, pnum, poff;
+ int num_grp_locked = 0;
+ struct ext4_group_info *this_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ struct page *page = NULL, *bitmap_page = NULL;
+
+ mb_debug(1, "init group %u\n", group);
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ this_grp = ext4_get_group_info(sb, group);
+ /*
+ * This ensures that we don't reinit the buddy cache
+ * page which map to the group from which we are already
+ * allocating. If we are looking at the buddy cache we would
+ * have taken a reference using ext4_mb_load_buddy and that
+ * would have taken the alloc_sem lock.
+ */
+ num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
+ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ /*
+ * somebody initialized the group
+ * return without doing anything
+ */
+ ret = 0;
+ goto err;
+ }
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+ bitmap_page = page;
+ bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+ /* init buddy cache */
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page == bitmap_page) {
+ /*
+ * If both the bitmap and buddy are in
+ * the same page we don't need to force
+ * init the buddy
+ */
+ unlock_page(page);
+ } else if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+err:
+ ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+ if (bitmap_page)
+ page_cache_release(bitmap_page);
+ if (page)
+ page_cache_release(page);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
- mb_debug("load group %u\n", group);
+ mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* groups mapped by the page is blocked
* till we are done with allocation
*/
+repeat_load_buddy:
down_read(e4b->alloc_semp);
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ /* we need to check for group need init flag
+ * with alloc_semp held so that we can be sure
+ * that new blocks didn't get added to the group
+ * when we are loading the buddy cache
+ */
+ up_read(e4b->alloc_semp);
+ /*
+ * we need full data about the group
+ * to make a good selection
+ */
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ return ret;
+ goto repeat_load_buddy;
+ }
+
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->alloc_semp = e4b->alloc_semp;
e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
- if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
}
-static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
- int ret;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
- struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
-
- mb_debug("init group %lu\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- this_grp = ext4_get_group_info(sb, group);
- /*
- * This ensures we don't add group
- * to this buddy cache via resize
- */
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
- /*
- * somebody initialized the group
- * return without doing anything
- */
- ret = 0;
- goto err;
- }
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
-
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
- /*
- * If both the bitmap and buddy are in
- * the same page we don't need to force
- * init the buddy
- */
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
-err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
- return ret;
-}
-
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
- loff_t size, isize;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
+ /* non-extent files are limited to low blocks/groups */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+ ngroups = sbi->s_blockfile_groups;
+
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
/* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
}
bsbits = ac->ac_sb->s_blocksize_bits;
- /* if stream allocation is enabled, use global goal */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
- if (size < isize)
- size = isize;
- if (size < sbi->s_mb_stream_request &&
- (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ /* if stream allocation is enabled, use global goal */
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
/* TBD: may be hot point */
spin_lock(&sbi->s_md_lock);
ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
+
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -2015,27 +2037,6 @@ repeat:
if (grp->bb_free == 0)
continue;
- /*
- * if the group is already init we check whether it is
- * a good group and if not we don't load the buddy
- */
- if (EXT4_MB_GRP_NEED_INIT(grp)) {
- /*
- * we need full data about the group
- * to make a good selection
- */
- err = ext4_mb_init_group(sb, group);
- if (err)
- goto out;
- }
-
- /*
- * If the particular group doesn't satisfy our
- * criteria we continue with the next group
- */
- if (!ext4_mb_good_group(ac, group, cr))
- continue;
-
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
goto out;
@@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
- "%-5s %-2s %-5s %-5s %-5s %-6s\n",
+ "%-5s %-2s %-6s %-5s %-5s %-6s\n",
"pid", "inode", "original", "goal", "result", "found",
"grps", "cr", "flags", "merge", "tail", "broken");
return 0;
@@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
if (hs->op == EXT4_MB_HISTORY_ALLOC) {
fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
- "%-5u %-5s %-5u %-6u\n";
+ "0x%04x %-5s %-5u %-6u\n";
sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len,
hs->result.fe_logical);
@@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations ext4_mb_seq_history_ops = {
+static const struct seq_operations ext4_mb_seq_history_ops = {
.start = ext4_mb_seq_history_start,
.next = ext4_mb_seq_history_next,
.stop = ext4_mb_seq_history_stop,
@@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
return count;
}
-static struct file_operations ext4_mb_seq_history_fops = {
+static const struct file_operations ext4_mb_seq_history_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_history_open,
.read = seq_read,
@@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct ext4_buddy e4b;
struct sg {
struct ext4_group_info info;
- unsigned short counters[16];
+ ext4_grpblk_t counters[16];
} sg;
group--;
@@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations ext4_mb_seq_groups_ops = {
+static const struct seq_operations ext4_mb_seq_groups_ops = {
.start = ext4_mb_seq_groups_start,
.next = ext4_mb_seq_groups_next,
.stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
}
-static struct file_operations ext4_mb_seq_groups_fops = {
+static const struct file_operations ext4_mb_seq_groups_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
@@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
- meta_group_info[i]->bb_free_root.rb_node = NULL;;
+ meta_group_info[i]->bb_free_root.rb_node = NULL;
#ifdef DOUBLE_CHECK
{
@@ -2558,26 +2559,15 @@ exit_meta_group_info:
return -ENOMEM;
} /* ext4_mb_add_groupinfo */
-/*
- * Update an existing group.
- * This function is used for online resize
- */
-void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
-{
- grp->bb_free += add;
-}
-
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
- int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int num_meta_group_infos;
int num_meta_group_infos_max;
int array_size;
- struct ext4_group_info **meta_group_info;
struct ext4_group_desc *desc;
/* This is the number of blocks used by GDT */
@@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freesgi;
}
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
-
- metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
- for (i = 0; i < num_meta_group_infos; i++) {
- if ((i + 1) == num_meta_group_infos)
- metalen = sizeof(*meta_group_info) *
- (ngroups -
- (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
- meta_group_info = kmalloc(metalen, GFP_KERNEL);
- if (meta_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
- "buddy group\n");
- goto err_freemeta;
- }
- sbi->s_group_info[i] = meta_group_info;
- }
-
for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
@@ -2655,7 +2629,6 @@ err_freebuddy:
while (i-- > 0)
kfree(ext4_get_group_info(sb, i));
i = num_meta_group_infos;
-err_freemeta:
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
@@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned max;
int ret;
- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
return -ENOMEM;
}
- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
kfree(sbi->s_mb_offsets);
@@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
- mb_debug("mballoc: %u PAs left\n", count);
+ mb_debug(1, "mballoc: %u PAs left\n", count);
}
@@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
list_for_each_safe(l, ltmp, &txn->t_private_list) {
entry = list_entry(l, struct ext4_free_data, list);
- mb_debug("gonna free %u blocks in group %u (0x%p):",
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry);
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
ext4_mb_release_desc(&e4b);
}
- mb_debug("freed %u blocks in %u structures\n", count, count2);
+ mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+}
+
+#ifdef CONFIG_EXT4_DEBUG
+u8 mb_enable_debug __read_mostly;
+
+static struct dentry *debugfs_dir;
+static struct dentry *debugfs_debug;
+
+static void __init ext4_create_debugfs_entry(void)
+{
+ debugfs_dir = debugfs_create_dir("ext4", NULL);
+ if (debugfs_dir)
+ debugfs_debug = debugfs_create_u8("mballoc-debug",
+ S_IRUGO | S_IWUSR,
+ debugfs_dir,
+ &mb_enable_debug);
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+ debugfs_remove(debugfs_debug);
+ debugfs_remove(debugfs_dir);
}
+#else
+
+static void __init ext4_create_debugfs_entry(void)
+{
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+}
+
+#endif
+
int __init init_ext4_mballoc(void)
{
ext4_pspace_cachep =
@@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
+ ext4_create_debugfs_entry();
return 0;
}
@@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
kmem_cache_destroy(ext4_free_ext_cachep);
+ ext4_remove_debugfs_entry();
}
@@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
else
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
- mb_debug("#%u: goal %u blocks for locality group\n",
+ mb_debug(1, "#%u: goal %u blocks for locality group\n",
current->pid, ac->ac_g_ex.fe_len);
}
@@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
ac->ac_o_ex.fe_logical < pa->pa_lstart));
- /* skip PA normalized request doesn't overlap with */
- if (pa->pa_lstart >= end) {
- spin_unlock(&pa->pa_lock);
- continue;
- }
- if (pa_end <= start) {
+ /* skip PAs this normalized request doesn't overlap with */
+ if (pa->pa_lstart >= end || pa_end <= start) {
spin_unlock(&pa->pa_lock);
continue;
}
BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+ /* adjust start or end to be adjacent to this pa */
if (pa_end <= ac->ac_o_ex.fe_logical) {
BUG_ON(pa_end < start);
start = pa_end;
- }
-
- if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+ } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
BUG_ON(pa->pa_lstart > end);
end = pa->pa_lstart;
}
@@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+ mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
(unsigned) orig_size, (unsigned) start);
}
@@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
- mb_debug("use %llu/%u from inode pa %p\n", start, len, pa);
+ mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
}
/*
@@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
- mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+ mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
}
/*
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
continue;
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+ pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ continue;
+
/* found preallocated blocks, use them */
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
preallocated += len;
count++;
}
- mb_debug("prellocated %u for group %u\n", preallocated, group);
+ mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
}
static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
- mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_inode_pa(ac, pa);
@@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_deleted = 0;
pa->pa_type = MB_GROUP_PA;
- mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_group_pa(ac, pa);
@@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
- mb_debug(" free preallocated %u/%u in group %u\n",
+ mb_debug(1, " free preallocated %u/%u in group %u\n",
(unsigned) start, (unsigned) next - bit,
(unsigned) group);
free += next - bit;
@@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
int busy = 0;
int free = 0;
- mb_debug("discard preallocation for group %u\n", group);
+ mb_debug(1, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
return 0;
@@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode)
return;
}
- mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+ mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
trace_ext4_discard_preallocations(inode);
INIT_LIST_HEAD(&list);
@@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
{
BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
}
-#ifdef MB_DEBUG
+#ifdef CONFIG_EXT4_DEBUG
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
ext4_get_group_no_and_offset(sb, pa->pa_pstart,
NULL, &start);
spin_unlock(&pa->pa_lock);
- printk(KERN_ERR "PA:%lu:%d:%u \n", i,
- start, pa->pa_len);
+ printk(KERN_ERR "PA:%u:%d:%u \n", i,
+ start, pa->pa_len);
}
ext4_unlock_group(sb, i);
if (grp->bb_free == 0)
continue;
- printk(KERN_ERR "%lu: %d/%d \n",
+ printk(KERN_ERR "%u: %d/%d \n",
i, grp->bb_free, grp->bb_fragments);
}
printk(KERN_ERR "\n");
@@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return;
+
size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
+ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+ >> bsbits;
size = max(size, isize);
- /* don't use group allocation for large files */
- if (size >= sbi->s_mb_stream_request)
+ if ((size == isize) &&
+ !ext4_fs_is_busy(sbi) &&
+ (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
+ }
- if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ /* don't use group allocation for large files */
+ if (size >= sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
+ }
BUG_ON(ac->ac_lg != NULL);
/*
@@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
- mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+ mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
"left: %u/%u, right %u/%u to %swritable\n",
(unsigned) ar->len, (unsigned) ar->logical,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
struct ext4_prealloc_space *pa, *tmp;
struct ext4_allocation_context *ac;
- mb_debug("discard locality group preallocation\n");
+ mb_debug(1, "discard locality group preallocation\n");
INIT_LIST_HEAD(&discard_list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..188d3d709b24 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@
/*
*/
-#define MB_DEBUG__
-#ifdef MB_DEBUG
-#define mb_debug(fmt, a...) printk(fmt, ##a)
+#ifdef CONFIG_EXT4_DEBUG
+extern u8 mb_enable_debug;
+
+#define mb_debug(n, fmt, a...) \
+ do { \
+ if ((n) <= mb_enable_debug) { \
+ printk(KERN_DEBUG "(%s, %d): %s: ", \
+ __FILE__, __LINE__, __func__); \
+ printk(fmt, ## a); \
+ } \
+ } while (0)
#else
-#define mb_debug(fmt, a...)
+#define mb_debug(n, fmt, a...)
#endif
/*
@@ -128,8 +136,8 @@ struct ext4_prealloc_space {
unsigned pa_deleted;
ext4_fsblk_t pa_pstart; /* phys. block */
ext4_lblk_t pa_lstart; /* log. block */
- unsigned short pa_len; /* len of preallocated chunk */
- unsigned short pa_free; /* how many blocks are free */
+ ext4_grpblk_t pa_len; /* len of preallocated chunk */
+ ext4_grpblk_t pa_free; /* how many blocks are free */
unsigned short pa_type; /* pa type. inode or group */
spinlock_t *pa_obj_lock;
struct inode *pa_inode; /* hack, for history only */
@@ -144,7 +152,7 @@ struct ext4_free_extent {
ext4_lblk_t fe_logical;
ext4_grpblk_t fe_start;
ext4_group_t fe_group;
- int fe_len;
+ ext4_grpblk_t fe_len;
};
/*
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..bf519f239ae6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
down_write(&EXT4_I(inode)->i_data_sem);
/*
- * if EXT4_EXT_MIGRATE is cleared a block allocation
+ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
* happened after we started the migrate. We need to
* fail the migrate
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
+ if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
retval = -EAGAIN;
up_write(&EXT4_I(inode)->i_data_sem);
goto err_out;
} else
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
/*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
* when we add extents we extent the journal
*/
/*
- * Even though we take i_mutex we can still cause block allocation
- * via mmap write to holes. If we have allocated new blocks we fail
- * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
- * The flag is updated with i_data_sem held to prevent racing with
- * block allocation.
+ * Even though we take i_mutex we can still cause block
+ * allocation via mmap write to holes. If we have allocated
+ * new blocks we fail migrate. New block allocation will
+ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
+ * with i_data_sem held to prevent racing with block
+ * allocation.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
up_read((&EXT4_I(inode)->i_data_sem));
handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ err_out:
tmp_inode->i_nlink = 0;
ext4_journal_stop(handle);
-
+ unlock_new_inode(tmp_inode);
iput(tmp_inode);
return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..c07a2915e40b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,31 @@
#include "ext4_extents.h"
#include "ext4.h"
-#define get_ext_path(path, inode, block, ret) \
- do { \
- path = ext4_ext_find_extent(inode, block, path); \
- if (IS_ERR(path)) { \
- ret = PTR_ERR(path); \
- path = NULL; \
- } \
- } while (0)
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode: an inode which is searched
+ * @lblock: logical block number to find an extent path
+ * @path: pointer to an extent path pointer (for output)
+ *
+ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+ struct ext4_ext_path **path)
+{
+ int ret = 0;
+
+ *path = ext4_ext_find_extent(inode, lblock, *path);
+ if (IS_ERR(*path)) {
+ ret = PTR_ERR(*path);
+ *path = NULL;
+ } else if ((*path)[ext_depth(inode)].p_ext == NULL)
+ ret = -ENODATA;
+
+ return ret;
+}
/**
* copy_extent_status - Copy the extent's initialization status
@@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
+ * mext_check_null_inode - NULL check for two inodes
+ *
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+static int
+mext_check_null_inode(struct inode *inode1, struct inode *inode2,
+ const char *function)
+{
+ int ret = 0;
+
+ if (inode1 == NULL) {
+ ext4_error(inode2->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 NULL inode2 %lu", inode2->i_ino);
+ ret = -EIO;
+ } else if (inode2 == NULL) {
+ ext4_error(inode1->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 %lu inode2 NULL", inode1->i_ino);
+ ret = -EIO;
+ }
+ return ret;
+}
+
+/**
* mext_double_down_read - Acquire two inodes' read semaphore
*
* @orig_inode: original inode structure
@@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
/*
* Use the inode number to provide the stable locking order instead
* of its address, because the C language doesn't guarantee you can
@@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
/*
* Use the inode number to provide the stable locking order instead
* of its address, because the C language doesn't guarantee you can
@@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
static void
mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
up_read(&EXT4_I(orig_inode)->i_data_sem);
up_read(&EXT4_I(donor_inode)->i_data_sem);
}
@@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
static void
mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
}
@@ -283,8 +317,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
if (new_flag) {
- get_ext_path(orig_path, orig_inode, eblock, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, eblock, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
@@ -293,9 +327,9 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
if (end_flag) {
- get_ext_path(orig_path, orig_inode,
- le32_to_cpu(end_ext->ee_block) - 1, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode,
+ le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
@@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* oext |-----------|
* new_ext |-------|
*/
- BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "new_ext_end(%u) should be less than or equal to "
+ "oext->ee_block(%u) + oext_alen(%d) - 1",
+ new_ext_end, le32_to_cpu(oext->ee_block),
+ oext_alen);
+ ret = -EIO;
+ goto out;
+ }
/*
* Case: new_ext is smaller than original extent
@@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
o_end, &start_ext, &new_ext, &end_ext);
+out:
return ret;
}
@@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* @orig_off: block offset of original inode
* @donor_off: block offset of donor inode
* @max_count: the maximun length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
*/
-static void
+static int
mext_calc_swap_extents(struct ext4_extent *tmp_dext,
struct ext4_extent *tmp_oext,
ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
ext4_lblk_t diff, orig_diff;
struct ext4_extent dext_old, oext_old;
+ BUG_ON(orig_off != donor_off);
+
+ /* original and donor extents have to cover the same block offset */
+ if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+ le32_to_cpu(tmp_oext->ee_block) +
+ ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+ return -ENODATA;
+
+ if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+ le32_to_cpu(tmp_dext->ee_block) +
+ ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+ return -ENODATA;
+
dext_old = *tmp_dext;
oext_old = *tmp_oext;
@@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
copy_extent_status(&oext_old, tmp_dext);
copy_extent_status(&dext_old, tmp_oext);
+
+ return 0;
}
/**
@@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
mext_double_down_write(orig_inode, donor_inode);
/* Get the original extent for the block "orig_off" */
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (err)
goto out;
/* Get the donor extent for the head */
- get_ext_path(donor_path, donor_inode, donor_off, err);
- if (donor_path == NULL)
+ err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
dext = donor_path[depth].p_ext;
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count);
+ if (err)
+ goto out;
/* Loop for the donor extents */
while (1) {
/* The extent for donor must be found. */
- BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+ if (!dext) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "The extent for donor must be found");
+ err = -EIO;
+ goto out;
+ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "Donor offset(%u) and the first block of donor "
+ "extent(%u) should be equal",
+ donor_off,
+ le32_to_cpu(tmp_dext.ee_block));
+ err = -EIO;
+ goto out;
+ }
/* Set donor extent to orig extent */
err = mext_leaf_block(handle, orig_inode,
@@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (donor_path)
ext4_ext_drop_refs(donor_path);
- get_ext_path(donor_path, donor_inode,
- donor_off, err);
- if (donor_path == NULL)
+ err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (err)
goto out;
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
@@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
}
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
- donor_off,
- count - replaced_count);
+ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count - replaced_count);
+ if (err)
+ goto out;
}
out:
@@ -740,7 +815,7 @@ out:
* on success, or a negative error value on failure.
*/
static int
-move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
int block_len_in_page, int uninit)
{
@@ -871,6 +946,7 @@ out:
if (PageLocked(page))
unlock_page(page);
page_cache_release(page);
+ ext4_journal_stop(handle);
}
out2:
ext4_journal_stop(handle);
@@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode,
struct inode *donor_inode, __u64 orig_start,
__u64 donor_start, __u64 *len, __u64 moved_len)
{
+ ext4_lblk_t orig_blocks, donor_blocks;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ unsigned int blocksize = 1 << blkbits;
+
/* Regular file check */
if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
ext4_debug("ext4 move extent: The argument files should be "
@@ -960,54 +1040,58 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- if ((orig_start > MAX_DEFRAG_SIZE) ||
- (donor_start > MAX_DEFRAG_SIZE) ||
- (*len > MAX_DEFRAG_SIZE) ||
- (orig_start + *len > MAX_DEFRAG_SIZE)) {
- ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
- "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+ if ((orig_start > EXT_MAX_BLOCK) ||
+ (donor_start > EXT_MAX_BLOCK) ||
+ (*len > EXT_MAX_BLOCK) ||
+ (orig_start + *len > EXT_MAX_BLOCK)) {
+ ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+ "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
if (orig_inode->i_size > donor_inode->i_size) {
- if (orig_start >= donor_inode->i_size) {
+ donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start >= donor_blocks) {
ext4_debug("ext4 move extent: orig start offset "
- "[%llu] should be less than donor file size "
- "[%lld] [ino:orig %lu, donor_inode %lu]\n",
- orig_start, donor_inode->i_size,
+ "[%llu] should be less than donor file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, donor_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > donor_inode->i_size) {
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start + *len > donor_blocks) {
ext4_debug("ext4 move extent: End offset [%llu] should "
- "be less than donor file size [%lld]."
- "So adjust length from %llu to %lld "
+ "be less than donor file blocks [%u]."
+ "So adjust length from %llu to %llu "
"[ino:orig %lu, donor %lu]\n",
- orig_start + *len, donor_inode->i_size,
- *len, donor_inode->i_size - orig_start,
+ orig_start + *len, donor_blocks,
+ *len, donor_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = donor_inode->i_size - orig_start;
+ *len = donor_blocks - orig_start;
}
} else {
- if (orig_start >= orig_inode->i_size) {
+ orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+ if (orig_start >= orig_blocks) {
ext4_debug("ext4 move extent: start offset [%llu] "
- "should be less than original file size "
- "[%lld] [inode:orig %lu, donor %lu]\n",
- orig_start, orig_inode->i_size,
+ "should be less than original file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, orig_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > orig_inode->i_size) {
+ if (orig_start + *len > orig_blocks) {
ext4_debug("ext4 move extent: Adjust length "
- "from %llu to %lld. Because it should be "
- "less than original file size "
+ "from %llu to %llu. Because it should be "
+ "less than original file blocks "
"[ino:orig %lu, donor %lu]\n",
- *len, orig_inode->i_size - orig_start,
+ *len, orig_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = orig_inode->i_size - orig_start;
+ *len = orig_blocks - orig_start;
}
}
@@ -1027,18 +1111,23 @@ mext_check_arguments(struct inode *orig_inode,
* @inode1: the inode structure
* @inode2: the inode structure
*
- * Lock two inodes' i_mutex by i_ino order. This function is moved from
- * fs/inode.c.
+ * Lock two inodes' i_mutex by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
- if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
- if (inode1)
- mutex_lock(&inode1->i_mutex);
- else if (inode2)
- mutex_lock(&inode2->i_mutex);
- return;
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
+ if (inode1 == inode2) {
+ mutex_lock(&inode1->i_mutex);
+ goto out;
}
if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1137,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
}
+
+out:
+ return ret;
}
/**
@@ -1056,17 +1148,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
- * This function is moved from fs/inode.c.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
if (inode1)
mutex_unlock(&inode1->i_mutex);
if (inode2 && inode2 != inode1)
mutex_unlock(&inode2->i_mutex);
+
+out:
+ return ret;
}
/**
@@ -1123,70 +1226,76 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
ext4_lblk_t rest_blocks;
pgoff_t orig_page_offset = 0, seq_end_page;
- int ret, depth, last_extent = 0;
+ int ret1, ret2, depth, last_extent = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
int uninit;
/* protect orig and donor against a truncate */
- mext_inode_double_lock(orig_inode, donor_inode);
+ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+ if (ret1 < 0)
+ return ret1;
mext_double_down_read(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len, *moved_len);
mext_double_up_read(orig_inode, donor_inode);
- if (ret)
- goto out2;
+ if (ret1)
+ goto out;
file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
block_end = block_start + len - 1;
if (file_end < block_end)
len -= block_end - file_end;
- get_ext_path(orig_path, orig_inode, block_start, ret);
- if (orig_path == NULL)
- goto out2;
+ ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret1)
+ goto out;
/* Get path structure to check the hole */
- get_ext_path(holecheck_path, orig_inode, block_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret1)
goto out;
depth = ext_depth(orig_inode);
ext_cur = holecheck_path[depth].p_ext;
- if (ext_cur == NULL) {
- ret = -EINVAL;
- goto out;
- }
/*
- * Get proper extent whose ee_block is beyond block_start
- * if block_start was within the hole.
+ * Get proper starting location of block replacement if block_start was
+ * within the hole.
*/
if (le32_to_cpu(ext_cur->ee_block) +
ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ /*
+ * The hole exists between extents or the tail of
+ * original file.
+ */
last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
- }
- seq_start = block_start;
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+ /* The hole exists at the beginning of original file. */
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ else
+ seq_start = block_start;
/* No blocks within the specified range. */
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
- ret = -EINVAL;
+ ret1 = -EINVAL;
goto out;
}
@@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode, holecheck_path,
&ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
break;
}
add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
while (orig_page_offset <= seq_end_page) {
/* Swap original branches with new branches */
- ret = move_extent_par_page(o_filp, donor_inode,
+ ret1 = move_extent_per_page(o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
block_len_in_page, uninit);
- if (ret < 0)
+ if (ret1 < 0)
goto out;
orig_page_offset++;
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
- BUG_ON(*moved_len > len);
+ if (*moved_len > len) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "We replaced blocks too much! "
+ "sum of replaced: %llu requested: %llu",
+ *moved_len, len);
+ ret1 = -EIO;
+ goto out;
+ }
data_offset_in_page = 0;
rest_blocks -= block_len_in_page;
@@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- get_ext_path(holecheck_path, orig_inode,
- seq_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret1)
break;
depth = holecheck_path->p_depth;
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, seq_start, ret);
- if (orig_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret1)
break;
ext_cur = holecheck_path[depth].p_ext;
@@ -1307,14 +1422,13 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
-out2:
- mext_inode_double_unlock(orig_inode, donor_inode);
- if (ret)
- return ret;
+ ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
- /* All of the specified blocks must be exchanged in succeed */
- BUG_ON(*moved_len != len);
+ if (ret1)
+ return ret1;
+ else if (ret2)
+ return ret2;
return 0;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 114abe5d2c1d..42f81d285cd5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return retval;
if (blocks == 1 && !dx_fallback &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
- return make_indexed_dir(handle, dentry, inode, bh);
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ retval = make_indexed_dir(handle, dentry, inode, bh);
+ if (retval == -ENOSPC)
+ brelse(bh);
+ return retval;
+ }
brelse(bh);
}
bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ if (retval == -ENOSPC)
+ brelse(bh);
+ return retval;
}
/*
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
sb->s_blocksize);
- node2->fake.inode = 0;
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (!de)
goto cleanup;
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- bh = NULL;
+ if (err != -ENOSPC)
+ bh = NULL;
goto cleanup;
journal_error:
@@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
int err, retries = 0;
- if (EXT4_DIR_LINK_MAX(inode))
+ if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
/*
@@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
retval = -EMLINK;
if (!new_inode && new_dir != old_dir &&
- new_dir->i_nlink >= EXT4_LINK_MAX)
+ EXT4_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
struct inode *inode = NULL;
handle_t *handle;
int gdb_off, gdb_num;
- int num_grp_locked = 0;
int err, err2;
gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* using the new disk blocks.
*/
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
/* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)((char *)primary->b_data +
gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* descriptor
*/
err = ext4_mb_add_groupinfo(sb, input->group, gdp);
- if (err) {
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
+ if (err)
goto exit_journal;
- }
/*
* Make the new blocks and inodes valid next. We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
/* Update the global fs size fields */
sbi->s_groups_count++;
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
ext4_handle_dirty_metadata(handle, NULL, primary);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..df539ba27779 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "mballoc.h"
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
@@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
errstr = "Out of memory";
break;
case -EROFS:
- if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+ if (!sb || (EXT4_SB(sb)->s_journal &&
+ EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
errstr = "Journal has aborted";
else
errstr = "Readonly filesystem";
@@ -962,7 +964,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
-static struct dquot_operations ext4_quota_operations = {
+static const struct dquot_operations ext4_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -983,7 +985,7 @@ static struct dquot_operations ext4_quota_operations = {
.destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops ext4_qctl_operations = {
+static const struct quotactl_ops ext4_qctl_operations = {
.quota_on = ext4_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
@@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb,
*journal_devnum = option;
break;
case Opt_journal_checksum:
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
- break;
+ break; /* Kept for backwards compatibility */
case Opt_journal_async_commit:
set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
break;
case Opt_noload:
set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
flex_group = ext4_flex_group(sbi, i);
- atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
- ext4_free_inodes_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
- ext4_free_blks_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
- ext4_used_dirs_count(sb, gdp));
+ atomic_add(ext4_free_inodes_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_inodes);
+ atomic_add(ext4_free_blks_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(ext4_used_dirs_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].used_dirs);
}
return 1;
@@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = {
.release = ext4_sb_release,
};
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR,
+ "Couldn't mount because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+ ~EXT4_FEATURE_INCOMPAT_SUPP));
+ return 0;
+ }
+
+ if (readonly)
+ return 1;
+
+ /* Check that feature set is OK for a read-write mount */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+ ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ return 0;
+ }
+ /*
+ * Large file size enabled file system can only be mounted
+ * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+ */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (sizeof(blkcnt_t) < sizeof(u64)) {
+ ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+ "cannot be mounted RDWR without "
+ "CONFIG_LBDAF");
+ return 0;
+ }
+ }
+ return 1;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
@@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned int db_count;
unsigned int i;
int needs_recovery, has_huge_files;
- int features;
__u64 blocks_count;
int err;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* previously didn't change the revision level when setting the flags,
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
- features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
- if (features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
- ~EXT4_FEATURE_INCOMPAT_SUPP));
- goto failed_mount;
- }
- features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
- if (!(sb->s_flags & MS_RDONLY) && features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount RDWR because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
- }
- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (has_huge_files) {
- /*
- * Large file size enabled file system can only be
- * mount if kernel is build with CONFIG_LBDAF
- */
- if (sizeof(root->i_blocks) < sizeof(u64) &&
- !(sb->s_flags & MS_RDONLY)) {
- ext4_msg(sb, KERN_ERR, "Filesystem with huge "
- "files cannot be mounted read-write "
- "without CONFIG_LBDAF");
- goto failed_mount;
- }
- }
+
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (ext4_blocks_count(es) >
- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+ /*
+ * Test whether we have more sectors than will fit in sector_t,
+ * and whether the max offset is addressable by the page cache.
+ */
+ if ((ext4_blocks_count(es) >
+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
+ (ext4_blocks_count(es) >
+ (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
ext4_msg(sb, KERN_ERR, "filesystem"
- " too large to mount safely");
+ " too large to mount safely on this system");
if (sizeof(sector_t) < 8)
ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+ ret = -EFBIG;
goto failed_mount;
}
@@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount4;
}
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ jbd2_journal_set_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ jbd2_journal_set_features(sbi->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+ else
jbd2_journal_clear_features(sbi->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else {
- jbd2_journal_clear_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- }
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
@@ -3208,7 +3225,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- es->s_wtime = cpu_to_le32(get_seconds());
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal)
ext4_mark_recovery_complete(sb, es);
} else {
- int ret;
- if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
- ext4_msg(sb, KERN_WARNING, "couldn't "
- "remount RDWR because of unsupported "
- "optional features (%x)",
- (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ /* Make sure we can mount this feature set readwrite */
+ if (!ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
}
-
/*
* Make sure the group descriptor checksums
* are sane. If they aren't, refuse to remount r/w.
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..fed5b01d7a8d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
get_bh(new_bh);
} else {
/* We need to allocate a new block */
- ext4_fsblk_t goal = ext4_group_first_block_no(sb,
+ ext4_fsblk_t goal, block;
+
+ goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+ block = ext4_new_meta_blocks(handle, inode,
goal, NULL, &error);
if (error)
goto cleanup;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 628235cf44b5..8e1e5e19d21e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,21 +35,29 @@
int nr_pdflush_threads;
/*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_args {
+ long nr_pages;
+ struct super_block *sb;
+ enum writeback_sync_modes sync_mode;
+ int for_kupdate;
+ int range_cyclic;
+};
+
+/*
* Work items for the bdi_writeback threads
*/
struct bdi_work {
- struct list_head list;
- struct list_head wait_list;
- struct rcu_head rcu_head;
+ struct list_head list; /* pending work list */
+ struct rcu_head rcu_head; /* for RCU free/clear of work */
- unsigned long seen;
- atomic_t pending;
+ unsigned long seen; /* threads that have seen this work */
+ atomic_t pending; /* number of threads still to do work */
- struct super_block *sb;
- unsigned long nr_pages;
- enum writeback_sync_modes sync_mode;
+ struct wb_writeback_args args; /* writeback arguments */
- unsigned long state;
+ unsigned long state; /* flag bits, see WS_* */
};
enum {
@@ -66,22 +74,13 @@ static inline bool bdi_work_on_stack(struct bdi_work *work)
}
static inline void bdi_work_init(struct bdi_work *work,
- struct writeback_control *wbc)
+ struct wb_writeback_args *args)
{
INIT_RCU_HEAD(&work->rcu_head);
- work->sb = wbc->sb;
- work->nr_pages = wbc->nr_to_write;
- work->sync_mode = wbc->sync_mode;
+ work->args = *args;
work->state = WS_USED;
}
-static inline void bdi_work_init_on_stack(struct bdi_work *work,
- struct writeback_control *wbc)
-{
- bdi_work_init(work, wbc);
- work->state |= WS_ONSTACK;
-}
-
/**
* writeback_in_progress - determine whether there is writeback in progress
* @bdi: the device's backing_dev_info structure.
@@ -98,6 +97,11 @@ static void bdi_work_clear(struct bdi_work *work)
{
clear_bit(WS_USED_B, &work->state);
smp_mb__after_clear_bit();
+ /*
+ * work can have disappeared at this point. bit waitq functions
+ * should be able to tolerate this, provided bdi_sched_wait does
+ * not dereference it's pointer argument.
+ */
wake_up_bit(&work->state, WS_USED_B);
}
@@ -113,7 +117,8 @@ static void bdi_work_free(struct rcu_head *head)
static void wb_work_complete(struct bdi_work *work)
{
- const enum writeback_sync_modes sync_mode = work->sync_mode;
+ const enum writeback_sync_modes sync_mode = work->args.sync_mode;
+ int onstack = bdi_work_on_stack(work);
/*
* For allocated work, we can clear the done/seen bit right here.
@@ -121,9 +126,9 @@ static void wb_work_complete(struct bdi_work *work)
* to after the RCU grace period, since the stack could be invalidated
* as soon as bdi_work_clear() has done the wakeup.
*/
- if (!bdi_work_on_stack(work))
+ if (!onstack)
bdi_work_clear(work);
- if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+ if (sync_mode == WB_SYNC_NONE || onstack)
call_rcu(&work->rcu_head, bdi_work_free);
}
@@ -146,21 +151,19 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
{
- if (work) {
- work->seen = bdi->wb_mask;
- BUG_ON(!work->seen);
- atomic_set(&work->pending, bdi->wb_cnt);
- BUG_ON(!bdi->wb_cnt);
-
- /*
- * Make sure stores are seen before it appears on the list
- */
- smp_mb();
+ work->seen = bdi->wb_mask;
+ BUG_ON(!work->seen);
+ atomic_set(&work->pending, bdi->wb_cnt);
+ BUG_ON(!bdi->wb_cnt);
- spin_lock(&bdi->wb_lock);
- list_add_tail_rcu(&work->list, &bdi->work_list);
- spin_unlock(&bdi->wb_lock);
- }
+ /*
+ * list_add_tail_rcu() contains the necessary barriers to
+ * make sure the above stores are seen before the item is
+ * noticed on the list
+ */
+ spin_lock(&bdi->wb_lock);
+ list_add_tail_rcu(&work->list, &bdi->work_list);
+ spin_unlock(&bdi->wb_lock);
/*
* If the default thread isn't there, make sure we add it. When
@@ -171,15 +174,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
else {
struct bdi_writeback *wb = &bdi->wb;
- /*
- * If we failed allocating the bdi work item, wake up the wb
- * thread always. As a safety precaution, it'll flush out
- * everything
- */
- if (!wb_has_dirty_io(wb)) {
- if (work)
- wb_clear_pending(wb, work);
- } else if (wb->task)
+ if (wb->task)
wake_up_process(wb->task);
}
}
@@ -194,48 +189,75 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
TASK_UNINTERRUPTIBLE);
}
-static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
+static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
+ struct wb_writeback_args *args)
{
struct bdi_work *work;
+ /*
+ * This is WB_SYNC_NONE writeback, so if allocation fails just
+ * wakeup the thread for old dirty data writeback
+ */
work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work)
- bdi_work_init(work, wbc);
+ if (work) {
+ bdi_work_init(work, args);
+ bdi_queue_work(bdi, work);
+ } else {
+ struct bdi_writeback *wb = &bdi->wb;
- return work;
+ if (wb->task)
+ wake_up_process(wb->task);
+ }
}
-void bdi_start_writeback(struct writeback_control *wbc)
+/**
+ * bdi_sync_writeback - start and wait for writeback
+ * @bdi: the backing device to write from
+ * @sb: write inodes from this super_block
+ *
+ * Description:
+ * This does WB_SYNC_ALL data integrity writeback and waits for the
+ * IO to complete. Callers must hold the sb s_umount semaphore for
+ * reading, to avoid having the super disappear before we are done.
+ */
+static void bdi_sync_writeback(struct backing_dev_info *bdi,
+ struct super_block *sb)
{
- const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
- struct bdi_work work_stack, *work = NULL;
+ struct wb_writeback_args args = {
+ .sb = sb,
+ .sync_mode = WB_SYNC_ALL,
+ .nr_pages = LONG_MAX,
+ .range_cyclic = 0,
+ };
+ struct bdi_work work;
- if (!must_wait)
- work = bdi_alloc_work(wbc);
+ bdi_work_init(&work, &args);
+ work.state |= WS_ONSTACK;
- if (!work) {
- work = &work_stack;
- bdi_work_init_on_stack(work, wbc);
- }
+ bdi_queue_work(bdi, &work);
+ bdi_wait_on_work_clear(&work);
+}
- bdi_queue_work(wbc->bdi, work);
+/**
+ * bdi_start_writeback - start writeback
+ * @bdi: the backing device to write from
+ * @nr_pages: the number of pages to write
+ *
+ * Description:
+ * This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ * started when this function returns, we make no guarentees on
+ * completion. Caller need not hold sb s_umount semaphore.
+ *
+ */
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+{
+ struct wb_writeback_args args = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_pages = nr_pages,
+ .range_cyclic = 1,
+ };
- /*
- * If the sync mode is WB_SYNC_ALL, block waiting for the work to
- * complete. If not, we only need to wait for the work to be started,
- * if we allocated it on-stack. We use the same mechanism, if the
- * wait bit is set in the bdi_work struct, then threads will not
- * clear pending until after they are done.
- *
- * Note that work == &work_stack if must_wait is true, so we don't
- * need to do call_rcu() here ever, since the completion path will
- * have done that for us.
- */
- if (must_wait || work == &work_stack) {
- bdi_wait_on_work_clear(work);
- if (work != &work_stack)
- call_rcu(&work->rcu_head, bdi_work_free);
- }
+ bdi_alloc_queue_work(bdi, &args);
}
/*
@@ -671,17 +693,16 @@ static inline bool over_bground_thresh(void)
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
-static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
- struct super_block *sb,
- enum writeback_sync_modes sync_mode, int for_kupdate)
+static long wb_writeback(struct bdi_writeback *wb,
+ struct wb_writeback_args *args)
{
struct writeback_control wbc = {
.bdi = wb->bdi,
- .sb = sb,
- .sync_mode = sync_mode,
+ .sb = args->sb,
+ .sync_mode = args->sync_mode,
.older_than_this = NULL,
- .for_kupdate = for_kupdate,
- .range_cyclic = 1,
+ .for_kupdate = args->for_kupdate,
+ .range_cyclic = args->range_cyclic,
};
unsigned long oldest_jif;
long wrote = 0;
@@ -691,13 +712,18 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
}
+ if (!wbc.range_cyclic) {
+ wbc.range_start = 0;
+ wbc.range_end = LLONG_MAX;
+ }
for (;;) {
/*
* Don't flush anything for non-integrity writeback where
* no nr_pages was given
*/
- if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
+ if (!args->for_kupdate && args->nr_pages <= 0 &&
+ args->sync_mode == WB_SYNC_NONE)
break;
/*
@@ -705,7 +731,8 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
* periodic background writeout and we are below the
* background dirty threshold, don't do anything
*/
- if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
+ if (args->for_kupdate && args->nr_pages <= 0 &&
+ !over_bground_thresh())
break;
wbc.more_io = 0;
@@ -713,7 +740,7 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
writeback_inodes_wb(wb, &wbc);
- nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
/*
@@ -731,7 +758,11 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
/*
* Return the next bdi_work struct that hasn't been processed by this
- * wb thread yet
+ * wb thread yet. ->seen is initially set for each thread that exists
+ * for this device, when a thread first notices a piece of work it
+ * clears its bit. Depending on writeback type, the thread will notify
+ * completion on either receiving the work (WB_SYNC_NONE) or after
+ * it is done (WB_SYNC_ALL).
*/
static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
struct bdi_writeback *wb)
@@ -741,8 +772,9 @@ static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
rcu_read_lock();
list_for_each_entry_rcu(work, &bdi->work_list, list) {
- if (!test_and_clear_bit(wb->nr, &work->seen))
+ if (!test_bit(wb->nr, &work->seen))
continue;
+ clear_bit(wb->nr, &work->seen);
ret = work;
break;
@@ -767,8 +799,16 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
global_page_state(NR_UNSTABLE_NFS) +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
- if (nr_pages)
- return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
+ if (nr_pages) {
+ struct wb_writeback_args args = {
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ .for_kupdate = 1,
+ .range_cyclic = 1,
+ };
+
+ return wb_writeback(wb, &args);
+ }
return 0;
}
@@ -780,35 +820,31 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
struct backing_dev_info *bdi = wb->bdi;
struct bdi_work *work;
- long nr_pages, wrote = 0;
+ long wrote = 0;
while ((work = get_next_work_item(bdi, wb)) != NULL) {
- enum writeback_sync_modes sync_mode;
-
- nr_pages = work->nr_pages;
+ struct wb_writeback_args args = work->args;
/*
* Override sync mode, in case we must wait for completion
*/
if (force_wait)
- work->sync_mode = sync_mode = WB_SYNC_ALL;
- else
- sync_mode = work->sync_mode;
+ work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
/*
* If this isn't a data integrity operation, just notify
* that we have seen this work and we are now starting it.
*/
- if (sync_mode == WB_SYNC_NONE)
+ if (args.sync_mode == WB_SYNC_NONE)
wb_clear_pending(wb, work);
- wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
+ wrote += wb_writeback(wb, &args);
/*
* This is a data integrity writeback, so only do the
* notification when we have completed the work.
*/
- if (sync_mode == WB_SYNC_ALL)
+ if (args.sync_mode == WB_SYNC_ALL)
wb_clear_pending(wb, work);
}
@@ -849,8 +885,7 @@ int bdi_writeback_task(struct bdi_writeback *wb)
}
wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(wait_jiffies);
+ schedule_timeout_interruptible(wait_jiffies);
try_to_freeze();
}
@@ -858,67 +893,28 @@ int bdi_writeback_task(struct bdi_writeback *wb)
}
/*
- * Schedule writeback for all backing devices. Expensive! If this is a data
- * integrity operation, writeback will be complete when this returns. If
- * we are simply called for WB_SYNC_NONE, then writeback will merely be
- * scheduled to run.
+ * Schedule writeback for all backing devices. This does WB_SYNC_NONE
+ * writeback, for integrity writeback see bdi_sync_writeback().
*/
-static void bdi_writeback_all(struct writeback_control *wbc)
+static void bdi_writeback_all(struct super_block *sb, long nr_pages)
{
- const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+ struct wb_writeback_args args = {
+ .sb = sb,
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ };
struct backing_dev_info *bdi;
- struct bdi_work *work;
- LIST_HEAD(list);
-
-restart:
- spin_lock(&bdi_lock);
- list_for_each_entry(bdi, &bdi_list, bdi_list) {
- struct bdi_work *work;
+ rcu_read_lock();
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
if (!bdi_has_dirty_io(bdi))
continue;
- /*
- * If work allocation fails, do the writes inline. We drop
- * the lock and restart the list writeout. This should be OK,
- * since this happens rarely and because the writeout should
- * eventually make more free memory available.
- */
- work = bdi_alloc_work(wbc);
- if (!work) {
- struct writeback_control __wbc;
-
- /*
- * Not a data integrity writeout, just continue
- */
- if (!must_wait)
- continue;
-
- spin_unlock(&bdi_lock);
- __wbc = *wbc;
- __wbc.bdi = bdi;
- writeback_inodes_wbc(&__wbc);
- goto restart;
- }
- if (must_wait)
- list_add_tail(&work->wait_list, &list);
-
- bdi_queue_work(bdi, work);
+ bdi_alloc_queue_work(bdi, &args);
}
- spin_unlock(&bdi_lock);
-
- /*
- * If this is for WB_SYNC_ALL, wait for pending work to complete
- * before returning.
- */
- while (!list_empty(&list)) {
- work = list_entry(list.next, struct bdi_work, wait_list);
- list_del(&work->wait_list);
- bdi_wait_on_work_clear(work);
- call_rcu(&work->rcu_head, bdi_work_free);
- }
+ rcu_read_unlock();
}
/*
@@ -927,17 +923,10 @@ restart:
*/
void wakeup_flusher_threads(long nr_pages)
{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .range_cyclic = 1,
- };
-
if (nr_pages == 0)
nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- wbc.nr_to_write = nr_pages;
- bdi_writeback_all(&wbc);
+ bdi_writeback_all(NULL, nr_pages);
}
static noinline void block_dump___mark_inode_dirty(struct inode *inode)
@@ -1084,7 +1073,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
* on the writer throttling path, and we get decent balancing between many
* throttled threads: we don't want them all piling up on inode_sync_wait.
*/
-static void wait_sb_inodes(struct writeback_control *wbc)
+static void wait_sb_inodes(struct super_block *sb)
{
struct inode *inode, *old_inode = NULL;
@@ -1092,7 +1081,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
* We need to be protected against the filesystem going from
* r/o to r/w or vice versa.
*/
- WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
spin_lock(&inode_lock);
@@ -1103,7 +1092,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
* In which case, the inode may not be on the dirty list, but
* we still have to wait for that writeout.
*/
- list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
struct address_space *mapping;
if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
@@ -1143,14 +1132,8 @@ static void wait_sb_inodes(struct writeback_control *wbc)
* for IO completion of submitted IO. The number of pages submitted is
* returned.
*/
-long writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb(struct super_block *sb)
{
- struct writeback_control wbc = {
- .sb = sb,
- .sync_mode = WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
long nr_to_write;
@@ -1158,9 +1141,7 @@ long writeback_inodes_sb(struct super_block *sb)
nr_to_write = nr_dirty + nr_unstable +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
- wbc.nr_to_write = nr_to_write;
- bdi_writeback_all(&wbc);
- return nr_to_write - wbc.nr_to_write;
+ bdi_writeback_all(sb, nr_to_write);
}
EXPORT_SYMBOL(writeback_inodes_sb);
@@ -1171,20 +1152,10 @@ EXPORT_SYMBOL(writeback_inodes_sb);
* This function writes and waits on any dirty inode belonging to this
* super_block. The number of pages synced is returned.
*/
-long sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb(struct super_block *sb)
{
- struct writeback_control wbc = {
- .sb = sb,
- .sync_mode = WB_SYNC_ALL,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
- long nr_to_write = LONG_MAX; /* doesn't actually matter */
-
- wbc.nr_to_write = nr_to_write;
- bdi_writeback_all(&wbc);
- wait_sb_inodes(&wbc);
- return nr_to_write - wbc.nr_to_write;
+ bdi_sync_writeback(sb->s_bdi, sb);
+ wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb0373..3773fd63d2f9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
return simple_read_from_buffer(buf, len, ppos, tmp, size);
}
+static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos, unsigned val)
+{
+ char tmp[32];
+ size_t size = sprintf(tmp, "%u\n", val);
+
+ return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, unsigned *val,
+ unsigned global_limit)
+{
+ unsigned long t;
+ char tmp[32];
+ unsigned limit = (1 << 16) - 1;
+ int err;
+
+ if (*ppos || count >= sizeof(tmp) - 1)
+ return -EINVAL;
+
+ if (copy_from_user(tmp, buf, count))
+ return -EINVAL;
+
+ tmp[count] = '\0';
+
+ err = strict_strtoul(tmp, 0, &t);
+ if (err)
+ return err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ limit = min(limit, global_limit);
+
+ if (t > limit)
+ return -EINVAL;
+
+ *val = t;
+
+ return count;
+}
+
+static ssize_t fuse_conn_max_background_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->max_background;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_max_background_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_bgreq);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->max_background = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->congestion_threshold;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_congthresh);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->congestion_threshold = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
static const struct file_operations fuse_ctl_abort_ops = {
.open = nonseekable_open,
.write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
.read = fuse_conn_waiting_read,
};
+static const struct file_operations fuse_conn_max_background_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_max_background_read,
+ .write = fuse_conn_max_background_write,
+};
+
+static const struct file_operations fuse_conn_congestion_threshold_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_congestion_threshold_read,
+ .write = fuse_conn_congestion_threshold_write,
+};
+
static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
struct fuse_conn *fc,
const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
goto err;
if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
- NULL, &fuse_ctl_waiting_ops) ||
+ NULL, &fuse_ctl_waiting_ops) ||
!fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
- NULL, &fuse_ctl_abort_ops))
+ NULL, &fuse_ctl_abort_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
+ 1, NULL, &fuse_conn_max_background_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
+ S_IFREG | 0600, 1, NULL,
+ &fuse_conn_congestion_threshold_ops))
goto err;
return 0;
@@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
d_drop(dentry);
dput(dentry);
}
- fuse_control_sb->s_root->d_inode->i_nlink--;
+ drop_nlink(fuse_control_sb->s_root->d_inode);
}
static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6484eb75acd6..51d9e33d634f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
static void flush_bg_queue(struct fuse_conn *fc)
{
- while (fc->active_background < FUSE_MAX_BACKGROUND &&
+ while (fc->active_background < fc->max_background &&
!list_empty(&fc->bg_queue)) {
struct fuse_req *req;
@@ -280,11 +280,11 @@ __releases(&fc->lock)
list_del(&req->intr_entry);
req->state = FUSE_REQ_FINISHED;
if (req->background) {
- if (fc->num_background == FUSE_MAX_BACKGROUND) {
+ if (fc->num_background == fc->max_background) {
fc->blocked = 0;
wake_up_all(&fc->blocked_waitq);
}
- if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+ if (fc->num_background == fc->congestion_threshold &&
fc->connected && fc->bdi_initialized) {
clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
{
req->background = 1;
fc->num_background++;
- if (fc->num_background == FUSE_MAX_BACKGROUND)
+ if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+ if (fc->num_background == fc->congestion_threshold &&
fc->bdi_initialized) {
set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..fc9c79feb5f7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
-/** Maximum number of outstanding background requests */
-#define FUSE_MAX_BACKGROUND 12
-
-/** Congestion starts at 75% of maximum */
-#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
-
/** Bias for fi->writectr, meaning new writepages must not be sent */
#define FUSE_NOWRITE INT_MIN
@@ -38,7 +32,7 @@
#define FUSE_NAME_MAX 1024
/** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 3
+#define FUSE_CTL_NUM_DENTRIES 5
/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
module will check permissions based on the file mode. Otherwise no
@@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list;
/** Global mutex protecting fuse_conn_list and the control filesystem */
extern struct mutex fuse_mutex;
+/** Module parameters */
+extern unsigned max_user_bgreq;
+extern unsigned max_user_congthresh;
+
/** FUSE inode */
struct fuse_inode {
/** Inode data */
@@ -349,6 +347,12 @@ struct fuse_conn {
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
+ /** Maximum number of outstanding background requests */
+ unsigned max_background;
+
+ /** Number of background requests at which congestion starts */
+ unsigned congestion_threshold;
+
/** Number of requests currently in the background */
unsigned num_background;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4567db6f9430..6da947daabda 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/parser.h>
#include <linux/statfs.h>
#include <linux/random.h>
@@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep;
struct list_head fuse_conn_list;
DEFINE_MUTEX(fuse_mutex);
+static int set_global_limit(const char *val, struct kernel_param *kp);
+
+unsigned max_user_bgreq;
+module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
+ &max_user_bgreq, 0644);
+__MODULE_PARM_TYPE(max_user_bgreq, "uint");
+MODULE_PARM_DESC(max_user_bgreq,
+ "Global limit for the maximum number of backgrounded requests an "
+ "unprivileged user can set");
+
+unsigned max_user_congthresh;
+module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
+ &max_user_congthresh, 0644);
+__MODULE_PARM_TYPE(max_user_congthresh, "uint");
+MODULE_PARM_DESC(max_user_congthresh,
+ "Global limit for the maximum congestion threshold an "
+ "unprivileged user can set");
+
#define FUSE_SUPER_MAGIC 0x65735546
#define FUSE_DEFAULT_BLKSIZE 512
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
struct fuse_mount_data {
int fd;
unsigned rootmode;
@@ -517,6 +542,8 @@ void fuse_conn_init(struct fuse_conn *fc)
INIT_LIST_HEAD(&fc->bg_queue);
INIT_LIST_HEAD(&fc->entry);
atomic_set(&fc->num_waiting, 0);
+ fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+ fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
fc->khctr = 0;
fc->polled_files = RB_ROOT;
fc->reqctr = 0;
@@ -727,6 +754,54 @@ static const struct super_operations fuse_super_operations = {
.show_options = fuse_show_options,
};
+static void sanitize_global_limit(unsigned *limit)
+{
+ if (*limit == 0)
+ *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+ sizeof(struct fuse_req);
+
+ if (*limit >= 1 << 16)
+ *limit = (1 << 16) - 1;
+}
+
+static int set_global_limit(const char *val, struct kernel_param *kp)
+{
+ int rv;
+
+ rv = param_set_uint(val, kp);
+ if (rv)
+ return rv;
+
+ sanitize_global_limit((unsigned *)kp->arg);
+
+ return 0;
+}
+
+static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
+{
+ int cap_sys_admin = capable(CAP_SYS_ADMIN);
+
+ if (arg->minor < 13)
+ return;
+
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
+ if (arg->max_background) {
+ fc->max_background = arg->max_background;
+
+ if (!cap_sys_admin && fc->max_background > max_user_bgreq)
+ fc->max_background = max_user_bgreq;
+ }
+ if (arg->congestion_threshold) {
+ fc->congestion_threshold = arg->congestion_threshold;
+
+ if (!cap_sys_admin &&
+ fc->congestion_threshold > max_user_congthresh)
+ fc->congestion_threshold = max_user_congthresh;
+ }
+}
+
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_init_out *arg = &req->misc.init_out;
@@ -736,6 +811,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
else {
unsigned long ra_pages;
+ process_init_limits(fc, arg);
+
if (arg->minor >= 6) {
ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
if (arg->flags & FUSE_ASYNC_READ)
@@ -894,6 +971,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_put_conn;
+ sb->s_bdi = &fc->bdi;
+
/* Handle umasking inside the fuse code */
if (sb->s_flags & MS_POSIXACL)
fc->dont_mask = 1;
@@ -1148,6 +1227,9 @@ static int __init fuse_init(void)
if (res)
goto err_sysfs_cleanup;
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
return 0;
err_sysfs_cleanup:
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a93b885311d8..06b7c2623f99 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -507,6 +507,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
INIT_LIST_HEAD(&inode->i_mapping->private_list);
info = HUGETLBFS_I(inode);
+ /*
+ * The policy is initialized here even if we are creating a
+ * private inode because initialization simply creates an
+ * an empty rb tree and calls spin_lock_init(), later when we
+ * call mpol_free_shared_policy() it will just return because
+ * the rb tree will still be empty.
+ */
mpol_shared_policy_init(&info->policy, NULL);
switch (mode & S_IFMT) {
default:
@@ -931,13 +938,19 @@ static struct file_system_type hugetlbfs_fs_type = {
static struct vfsmount *hugetlbfs_vfsmount;
-static int can_do_hugetlb_shm(void)
+static int can_do_hugetlb_shm(int creat_flags)
{
- return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
+ if (creat_flags != HUGETLB_SHMFS_INODE)
+ return 0;
+ if (capable(CAP_IPC_LOCK))
+ return 1;
+ if (in_group_p(sysctl_hugetlb_shm_group))
+ return 1;
+ return 0;
}
struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
- struct user_struct **user)
+ struct user_struct **user, int creat_flags)
{
int error = -ENOMEM;
struct file *file;
@@ -949,7 +962,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
if (!hugetlbfs_vfsmount)
return ERR_PTR(-ENOENT);
- if (!can_do_hugetlb_shm()) {
+ if (!can_do_hugetlb_shm(creat_flags)) {
*user = current_user();
if (user_shm_lock(size, *user)) {
WARN_ONCE(1,
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..f5ff71cb3e2b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -123,7 +123,7 @@ static void wake_up_inode(struct inode *inode)
int inode_init_always(struct super_block *sb, struct inode *inode)
{
static const struct address_space_operations empty_aops;
- static struct inode_operations empty_iops;
+ static const struct inode_operations empty_iops;
static const struct file_operations empty_fops;
struct address_space *const mapping = &inode->i_data;
@@ -182,9 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
if (sb->s_bdev) {
struct backing_dev_info *bdi;
- bdi = sb->s_bdev->bd_inode_backing_dev_info;
- if (!bdi)
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
mapping->backing_dev_info = bdi;
}
inode->i_private = NULL;
@@ -697,13 +695,15 @@ void unlock_new_inode(struct inode *inode)
}
#endif
/*
- * This is special! We do not need the spinlock
- * when clearing I_LOCK, because we're guaranteed
- * that nobody else tries to do anything about the
- * state of the inode when it is locked, as we
- * just created it (so there can be no old holders
- * that haven't tested I_LOCK).
+ * This is special! We do not need the spinlock when clearing I_LOCK,
+ * because we're guaranteed that nobody else tries to do anything about
+ * the state of the inode when it is locked, as we just created it (so
+ * there can be no old holders that haven't tested I_LOCK).
+ * However we must emit the memory barrier so that other CPUs reliably
+ * see the clearing of I_LOCK after the other inode initialisation has
+ * completed.
*/
+ smp_mb();
WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
inode->i_state &= ~(I_LOCK|I_NEW);
wake_up_inode(inode);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
{
transaction_t * transaction;
tid_t first_tid;
- unsigned long blocknr, freed;
+ unsigned int blocknr, freed;
if (is_journal_aborted(journal))
return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
freed = freed + journal->j_last - journal->j_first;
jbd_debug(1,
- "Cleaning journal tail from %d to %d (offset %lu), "
- "freeing %lu\n",
+ "Cleaning journal tail from %d to %d (offset %u), "
+ "freeing %u\n",
journal->j_tail_sequence, first_tid, blocknr, freed);
journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
int bufs;
int flags;
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
ktime_t start_time;
u64 commit_time;
char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f96f85092d1c..bd3c073b485d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
int journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
struct journal_head **jh_out,
- unsigned long blocknr)
+ unsigned int blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
@@ -567,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
* Log buffer allocation routines:
*/
-int journal_next_log_block(journal_t *journal, unsigned long *retp)
+int journal_next_log_block(journal_t *journal, unsigned int *retp)
{
- unsigned long blocknr;
+ unsigned int blocknr;
spin_lock(&journal->j_state_lock);
J_ASSERT(journal->j_free > 1);
@@ -590,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
* this is a no-op. If needed, we can use j_blk_offset - everything is
* ready.
*/
-int journal_bmap(journal_t *journal, unsigned long blocknr,
- unsigned long *retp)
+int journal_bmap(journal_t *journal, unsigned int blocknr,
+ unsigned int *retp)
{
int err = 0;
- unsigned long ret;
+ unsigned int ret;
if (journal->j_inode) {
ret = bmap(journal->j_inode, blocknr);
@@ -604,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
char b[BDEVNAME_SIZE];
printk(KERN_ALERT "%s: journal block not found "
- "at offset %lu on %s\n",
+ "at offset %u on %s\n",
__func__,
blocknr,
bdevname(journal->j_dev, b));
@@ -630,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
- unsigned long blocknr;
+ unsigned int blocknr;
int err;
err = journal_next_log_block(journal, &blocknr);
@@ -774,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
journal_t *journal = journal_init_common();
int err;
int n;
- unsigned long blocknr;
+ unsigned int blocknr;
if (!journal)
return NULL;
@@ -846,12 +846,12 @@ static void journal_fail_superblock (journal_t *journal)
static int journal_reset(journal_t *journal)
{
journal_superblock_t *sb = journal->j_superblock;
- unsigned long first, last;
+ unsigned int first, last;
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
- printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+ printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
first, last);
journal_fail_superblock(journal);
return -EINVAL;
@@ -885,7 +885,7 @@ static int journal_reset(journal_t *journal)
**/
int journal_create(journal_t *journal)
{
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
journal_superblock_t *sb;
int i, err;
@@ -969,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
if (sb->s_start == 0 && journal->j_tail_sequence ==
journal->j_transaction_sequence) {
jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
- "(start %ld, seq %d, errno %d)\n",
+ "(start %u, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
goto out;
}
spin_lock(&journal->j_state_lock);
- jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+ jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence, journal->j_errno);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1371,7 +1371,7 @@ int journal_flush(journal_t *journal)
{
int err = 0;
transaction_t *transaction = NULL;
- unsigned long old_tail;
+ unsigned int old_tail;
spin_lock(&journal->j_state_lock);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
*bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
- unsigned long next_log_block;
+ unsigned int next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
if (tid_geq(next_commit_ID, info->end_transaction))
break;
- jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
- jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+ jbd_debug(3, "JBD: checking block %u\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
<= journal->j_blocksize) {
- unsigned long io_block;
+ unsigned int io_block;
tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
- "block %ld in log\n",
+ "block %u in log\n",
err, io_block);
} else {
- unsigned long blocknr;
+ unsigned int blocknr;
J_ASSERT(obh != NULL);
blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
max = be32_to_cpu(header->r_count);
while (offset < max) {
- unsigned long blocknr;
+ unsigned int blocknr;
int err;
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
{
struct list_head hash;
tid_t sequence; /* Used for recovery only */
- unsigned long blocknr;
+ unsigned int blocknr;
};
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
/* Utility functions to maintain the revoke table */
/* Borrowed from buffer.c: this is a tried and tested block hash function */
-static inline int hash(journal_t *journal, unsigned long block)
+static inline int hash(journal_t *journal, unsigned int block)
{
struct jbd_revoke_table_s *table = journal->j_revoke;
int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
(block << (hash_shift - 12))) & (table->hash_size - 1);
}
-static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
tid_t seq)
{
struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
/* Find a revoke record in the journal's hash table. */
static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
- unsigned long blocknr)
+ unsigned int blocknr)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
* by one.
*/
-int journal_revoke(handle_t *handle, unsigned long blocknr,
+int journal_revoke(handle_t *handle, unsigned int blocknr,
struct buffer_head *bh_in)
{
struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
}
}
- jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+ jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
*/
int journal_set_revoke(journal_t *journal,
- unsigned long blocknr,
+ unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
*/
int journal_test_revoke(journal_t *journal,
- unsigned long blocknr,
+ unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c03ac11f74be..006f9ad838a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
spin_lock_init(&transaction->t_handle_lock);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+ journal->j_commit_timer.expires =
+ round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -228,6 +229,8 @@ repeat_locked:
__log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
@@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
handle = ERR_PTR(err);
goto out;
}
-
- lock_map_acquire(&handle->h_lockdep_map);
-
out:
return handle;
}
@@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
__log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..26d991ddc1e6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
#include <trace/events/jbd2.h>
/*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
bh->b_end_io = journal_end_buffer_io_sync;
if (journal->j_flags & JBD2_BARRIER &&
- !JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ !JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
set_buffer_ordered(bh);
barrier_done = 1;
}
@@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
.nr_to_write = mapping->nrpages * 2,
.range_start = 0,
.range_end = i_size_read(mapping->host),
- .for_writepages = 1,
};
ret = generic_writepages(mapping, &wbc);
@@ -707,11 +707,13 @@ start_journal_io:
/* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
+ if (journal->j_flags & JBD2_BARRIER)
+ blkdev_issue_flush(journal->j_dev, NULL);
}
/*
@@ -834,7 +836,7 @@ wait_for_iobuf:
jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..a8a358bc0f21 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
+ if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+ printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+ first, last);
+ journal_fail_superblock(journal);
+ return -EINVAL;
+ }
journal->j_first = first;
journal->j_last = last;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
__jbd2_log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
handle = ERR_PTR(err);
goto out;
}
-
- lock_map_acquire(&handle->h_lockdep_map);
out:
return handle;
}
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
__jbd2_log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0035c021395a..9a80e8e595d0 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -123,7 +123,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino));
}
-static struct export_operations jffs2_export_ops = {
+static const struct export_operations jffs2_export_ops = {
.get_parent = jffs2_get_parent,
.fh_to_dentry = jffs2_fh_to_dentry,
.fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 4336adba952a..c81249fef11f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -458,7 +458,7 @@ static void nlmclnt_locks_release_private(struct file_lock *fl)
nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
}
-static struct file_lock_operations nlmclnt_lock_ops = {
+static const struct file_lock_operations nlmclnt_lock_ops = {
.fl_copy_lock = nlmclnt_locks_copy_lock,
.fl_release_private = nlmclnt_locks_release_private,
};
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e577a78d7bac..d1001790fa9a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -705,7 +705,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
}
-struct lock_manager_operations nlmsvc_lock_operations = {
+const struct lock_manager_operations nlmsvc_lock_operations = {
.fl_compare_owner = nlmsvc_same_owner,
.fl_notify = nlmsvc_notify_blocked,
.fl_grant = nlmsvc_grant_deferred,
diff --git a/fs/locks.c b/fs/locks.c
index 19ee18a6829b..a8794f233bc9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -434,7 +434,7 @@ static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
return fl->fl_file == try->fl_file;
}
-static struct lock_manager_operations lease_manager_ops = {
+static const struct lock_manager_operations lease_manager_ops = {
.fl_break = lease_break_callback,
.fl_release_private = lease_release_private_callback,
.fl_mylease = lease_mylease_callback,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e350bd6a2334..a7ce15d3c248 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -933,10 +933,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
goto out_error;
nfs_server_set_fsinfo(server, &fsinfo);
- error = bdi_init(&server->backing_dev_info);
- if (error)
- goto out_error;
-
/* Get some general file system info */
if (server->namelen == 0) {
@@ -995,6 +991,12 @@ static struct nfs_server *nfs_alloc_server(void)
return NULL;
}
+ if (bdi_init(&server->backing_dev_info)) {
+ nfs_free_iostats(server->io_stats);
+ kfree(server);
+ return NULL;
+ }
+
return server;
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1434080aefeb..2ef4fecf3984 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -638,7 +638,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl)
nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
}
-static struct file_lock_operations nfs4_fl_lock_ops = {
+static const struct file_lock_operations nfs4_fl_lock_ops = {
.fl_copy_lock = nfs4_fl_copy_lock,
.fl_release_private = nfs4_fl_release_lock,
};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 867f70504531..f1cc0587cfef 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1918,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
if (server->flags & NFS_MOUNT_NOAC)
sb->s_flags |= MS_SYNCHRONOUS;
+ sb->s_bdi = &server->backing_dev_info;
+
nfs_super_set_maxbytes(sb, server->maxfilesize);
}
@@ -2188,8 +2190,8 @@ static void nfs_kill_super(struct super_block *s)
{
struct nfs_server *server = NFS_SB(s);
- bdi_unregister(&server->backing_dev_info);
kill_anon_super(s);
+ bdi_unregister(&server->backing_dev_info);
nfs_fscache_release_super_cookie(s);
nfs_free_server(server);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 120acadc6a84..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1490,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
.nr_to_write = LONG_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
- .for_writepages = 1,
};
return __nfs_write_mapping(mapping, &wbc, how);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 980a216a48c8..766d3d544544 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2163,7 +2163,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
return -EAGAIN;
}
-static struct lock_manager_operations nfsd_lease_mng_ops = {
+static const struct lock_manager_operations nfsd_lease_mng_ops = {
.fl_break = nfsd_break_deleg_cb,
.fl_release_private = nfsd_release_deleg_cb,
.fl_copy_lock = nfsd_copy_lock_deleg_cb,
@@ -3368,7 +3368,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
/* Hack!: For now, we're defining this just so we can use a pointer to it
* as a unique cookie to identify our (NFSv4's) posix locks. */
-static struct lock_manager_operations nfsd_posix_mng_ops = {
+static const struct lock_manager_operations nfsd_posix_mng_ops = {
};
static inline void
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index c668bca579c1..6a2711f4c321 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -46,7 +46,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
}
-static struct address_space_operations def_btnode_aops = {
+static const struct address_space_operations def_btnode_aops = {
.sync_page = block_sync_page,
};
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 6bd84a0d8238..fc8278c77cdd 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -151,7 +151,7 @@ struct file_operations nilfs_file_operations = {
.splice_read = generic_file_splice_read,
};
-struct inode_operations nilfs_file_inode_operations = {
+const struct inode_operations nilfs_file_inode_operations = {
.truncate = nilfs_truncate,
.setattr = nilfs_setattr,
.permission = nilfs_permission,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1b3c2bb20da9..e6de0a27ab5d 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,7 +52,7 @@
#include "dat.h"
#include "ifile.h"
-static struct address_space_operations def_gcinode_aops = {
+static const struct address_space_operations def_gcinode_aops = {
.sync_page = block_sync_page,
};
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 807e584b163d..2d2c501deb54 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -238,7 +238,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
return size;
}
-struct address_space_operations nilfs_aops = {
+const struct address_space_operations nilfs_aops = {
.writepage = nilfs_writepage,
.readpage = nilfs_readpage,
.sync_page = block_sync_page,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 156bf6091a96..b18c4998f8d0 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -427,12 +427,12 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
}
-static struct address_space_operations def_mdt_aops = {
+static const struct address_space_operations def_mdt_aops = {
.writepage = nilfs_mdt_write_page,
.sync_page = block_sync_page,
};
-static struct inode_operations def_mdt_iops;
+static const struct inode_operations def_mdt_iops;
static struct file_operations def_mdt_fops;
/*
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index df70dadb336f..ed02e886fa79 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -448,7 +448,7 @@ out:
return err;
}
-struct inode_operations nilfs_dir_inode_operations = {
+const struct inode_operations nilfs_dir_inode_operations = {
.create = nilfs_create,
.lookup = nilfs_lookup,
.link = nilfs_link,
@@ -462,12 +462,12 @@ struct inode_operations nilfs_dir_inode_operations = {
.permission = nilfs_permission,
};
-struct inode_operations nilfs_special_inode_operations = {
+const struct inode_operations nilfs_special_inode_operations = {
.setattr = nilfs_setattr,
.permission = nilfs_permission,
};
-struct inode_operations nilfs_symlink_inode_operations = {
+const struct inode_operations nilfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 724c63766e82..bad7368782d0 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -295,12 +295,12 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
* Inodes and files operations
*/
extern struct file_operations nilfs_dir_operations;
-extern struct inode_operations nilfs_file_inode_operations;
+extern const struct inode_operations nilfs_file_inode_operations;
extern struct file_operations nilfs_file_operations;
-extern struct address_space_operations nilfs_aops;
-extern struct inode_operations nilfs_dir_inode_operations;
-extern struct inode_operations nilfs_special_inode_operations;
-extern struct inode_operations nilfs_symlink_inode_operations;
+extern const struct address_space_operations nilfs_aops;
+extern const struct inode_operations nilfs_dir_inode_operations;
+extern const struct inode_operations nilfs_special_inode_operations;
+extern const struct inode_operations nilfs_symlink_inode_operations;
/*
* filesystem type
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 55f3d6b60732..644e66727dd0 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -504,7 +504,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
return 0;
}
-static struct super_operations nilfs_sops = {
+static const struct super_operations nilfs_sops = {
.alloc_inode = nilfs_alloc_inode,
.destroy_inode = nilfs_destroy_inode,
.dirty_inode = nilfs_dirty_inode,
@@ -560,7 +560,7 @@ nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
nilfs_nfs_get_inode);
}
-static struct export_operations nilfs_export_ops = {
+static const struct export_operations nilfs_export_ops = {
.fh_to_dentry = nilfs_fh_to_dentry,
.fh_to_parent = nilfs_fh_to_parent,
.get_parent = nilfs_get_parent,
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d4168e269c5d..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -591,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
- bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
- if (!bdi)
- bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
+ bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
/* Finding last segment */
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index cd0be3f5c3cd..a44b14cbceeb 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
/* return (void *)__get_free_page(gfp_mask); */
}
- if (likely(size >> PAGE_SHIFT < num_physpages))
+ if (likely((size >> PAGE_SHIFT) < totalram_pages))
return __vmalloc(size, gfp_mask, PAGE_KERNEL);
return NULL;
}
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 3fb96fcd4c81..e5df9d170b0c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -109,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
struct buffer_head **bh);
-extern struct dquot_operations ocfs2_quota_operations;
+extern const struct dquot_operations ocfs2_quota_operations;
extern struct quota_format_type ocfs2_quota_format;
int ocfs2_quota_setup(void);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 44f2a5e1d042..3af4954d537b 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -849,7 +849,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
kmem_cache_free(ocfs2_dquot_cachep, dquot);
}
-struct dquot_operations ocfs2_quota_operations = {
+const struct dquot_operations ocfs2_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a3f8871d21fd..faca4720aa47 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -965,7 +965,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
}
-static struct quotactl_ops ocfs2_quotactl_ops = {
+static const struct quotactl_ops ocfs2_quotactl_ops = {
.quota_on = ocfs2_quota_on,
.quota_off = ocfs2_quota_off,
.quota_sync = vfs_quota_sync,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c7275cfbdcfb..3680bae335b5 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -489,7 +489,7 @@ out:
return ret;
}
-struct inode_operations omfs_dir_inops = {
+const struct inode_operations omfs_dir_inops = {
.lookup = omfs_lookup,
.mkdir = omfs_mkdir,
.rename = omfs_rename,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d17e774eaf45..4845fbb18e6e 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -333,11 +333,11 @@ struct file_operations omfs_file_operations = {
.splice_read = generic_file_splice_read,
};
-struct inode_operations omfs_file_inops = {
+const struct inode_operations omfs_file_inops = {
.truncate = omfs_truncate
};
-struct address_space_operations omfs_aops = {
+const struct address_space_operations omfs_aops = {
.readpage = omfs_readpage,
.readpages = omfs_readpages,
.writepage = omfs_writepage,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 379ae5fb4411..f3b7c1541f3a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -278,7 +278,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct super_operations omfs_sops = {
+static const struct super_operations omfs_sops = {
.write_inode = omfs_write_inode,
.delete_inode = omfs_delete_inode,
.put_super = omfs_put_super,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index 2bc0f0670406..df71039945ac 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -45,15 +45,15 @@ extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
/* dir.c */
extern struct file_operations omfs_dir_operations;
-extern struct inode_operations omfs_dir_inops;
+extern const struct inode_operations omfs_dir_inops;
extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
u64 fsblock);
/* file.c */
extern struct file_operations omfs_file_operations;
-extern struct inode_operations omfs_file_inops;
-extern struct address_space_operations omfs_aops;
+extern const struct inode_operations omfs_file_inops;
+extern const struct address_space_operations omfs_aops;
extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
extern int omfs_shrink_inode(struct inode *inode);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 619ba99dfe39..7b685e10cbad 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,7 +312,7 @@ static struct attribute_group part_attr_group = {
.attrs = part_attrs,
};
-static struct attribute_group *part_attr_groups[] = {
+static const struct attribute_group *part_attr_groups[] = {
&part_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
&blk_trace_attr_group,
@@ -581,7 +581,7 @@ try_scan:
}
if (from + size > get_capacity(disk)) {
- struct block_device_operations *bdops = disk->fops;
+ const struct block_device_operations *bdops = disk->fops;
unsigned long long capacity;
printk(KERN_WARNING
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6f742f6658a9..55c4c805a756 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -447,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
do_posix_clock_monotonic_gettime(&uptime);
read_lock(&tasklist_lock);
- points = badness(task, uptime.tv_sec);
+ points = badness(task->group_leader, uptime.tv_sec);
read_unlock(&tasklist_lock);
return sprintf(buffer, "%lu\n", points);
}
@@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
char buffer[PROC_NUMBUF];
size_t len;
- int oom_adjust;
+ int oom_adjust = OOM_DISABLE;
+ unsigned long flags;
if (!task)
return -ESRCH;
- oom_adjust = task->oomkilladj;
+
+ if (lock_task_sighand(task, &flags)) {
+ oom_adjust = task->signal->oom_adj;
+ unlock_task_sighand(task, &flags);
+ }
+
put_task_struct(task);
len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1015,32 +1021,44 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
- int oom_adjust;
+ char buffer[PROC_NUMBUF];
+ long oom_adjust;
+ unsigned long flags;
+ int err;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- oom_adjust = simple_strtol(buffer, &end, 0);
+
+ err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
+ if (err)
+ return -EINVAL;
if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
oom_adjust != OOM_DISABLE)
return -EINVAL;
- if (*end == '\n')
- end++;
+
task = get_proc_task(file->f_path.dentry->d_inode);
if (!task)
return -ESRCH;
- if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
+ if (!lock_task_sighand(task, &flags)) {
+ put_task_struct(task);
+ return -ESRCH;
+ }
+
+ if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
+ unlock_task_sighand(task, &flags);
put_task_struct(task);
return -EACCES;
}
- task->oomkilladj = oom_adjust;
+
+ task->signal->oom_adj = oom_adjust;
+
+ unlock_task_sighand(task, &flags);
put_task_struct(task);
- if (end - buffer == 0)
- return -EIO;
- return end - buffer;
+
+ return count;
}
static const struct file_operations proc_oom_adjust_operations = {
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 59b43a068872..f06f45b42181 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -328,43 +328,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
return -EFAULT;
} else if (is_vmalloc_addr((void *)start)) {
char * elf_buf;
- struct vm_struct *m;
- unsigned long curstart = start;
- unsigned long cursize = tsz;
elf_buf = kzalloc(tsz, GFP_KERNEL);
if (!elf_buf)
return -ENOMEM;
-
- read_lock(&vmlist_lock);
- for (m=vmlist; m && cursize; m=m->next) {
- unsigned long vmstart;
- unsigned long vmsize;
- unsigned long msize = m->size - PAGE_SIZE;
-
- if (((unsigned long)m->addr + msize) <
- curstart)
- continue;
- if ((unsigned long)m->addr > (curstart +
- cursize))
- break;
- vmstart = (curstart < (unsigned long)m->addr ?
- (unsigned long)m->addr : curstart);
- if (((unsigned long)m->addr + msize) >
- (curstart + cursize))
- vmsize = curstart + cursize - vmstart;
- else
- vmsize = (unsigned long)m->addr +
- msize - vmstart;
- curstart = vmstart + vmsize;
- cursize -= vmsize;
- /* don't dump ioremap'd stuff! (TA) */
- if (m->flags & VM_IOREMAP)
- continue;
- memcpy(elf_buf + (vmstart - start),
- (char *)vmstart, vmsize);
- }
- read_unlock(&vmlist_lock);
+ vread(elf_buf, (char *)start, tsz);
+ /* we have to zero-fill user buffer even if no read */
if (copy_to_user(buffer, elf_buf, tsz)) {
kfree(elf_buf);
return -EFAULT;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d5c410d47fae..171e052c07b3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -81,9 +81,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
"Writeback: %8lu kB\n"
"AnonPages: %8lu kB\n"
"Mapped: %8lu kB\n"
+ "Shmem: %8lu kB\n"
"Slab: %8lu kB\n"
"SReclaimable: %8lu kB\n"
"SUnreclaim: %8lu kB\n"
+ "KernelStack: %8lu kB\n"
"PageTables: %8lu kB\n"
#ifdef CONFIG_QUICKLIST
"Quicklists: %8lu kB\n"
@@ -124,10 +126,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(global_page_state(NR_WRITEBACK)),
K(global_page_state(NR_ANON_PAGES)),
K(global_page_state(NR_FILE_MAPPED)),
+ K(global_page_state(NR_SHMEM)),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_SLAB_RECLAIMABLE)),
K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+ global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
K(global_page_state(NR_PAGETABLE)),
#ifdef CONFIG_QUICKLIST
K(quicklist_total_size()),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2707c6c7a20f..2281c2cbfe2b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -2,6 +2,7 @@
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/init.h>
+#include <linux/ksm.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/proc_fs.h>
@@ -95,6 +96,8 @@ static const struct file_operations proc_kpagecount_operations = {
#define KPF_UNEVICTABLE 18
#define KPF_NOPAGE 20
+#define KPF_KSM 21
+
/* kernel hacking assistances
* WARNING: subject to change, never rely on them!
*/
@@ -137,6 +140,8 @@ static u64 get_uflags(struct page *page)
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
+ if (PageKsm(page))
+ u |= 1 << KPF_KSM;
/*
* compound pages: export both head/tail info
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9bd8be1d235c..59e98fea34a4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -465,6 +465,10 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
+#define CLEAR_REFS_ALL 1
+#define CLEAR_REFS_ANON 2
+#define CLEAR_REFS_MAPPED 3
+
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -472,13 +476,15 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
char buffer[PROC_NUMBUF], *end;
struct mm_struct *mm;
struct vm_area_struct *vma;
+ int type;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- if (!simple_strtol(buffer, &end, 0))
+ type = simple_strtol(buffer, &end, 0);
+ if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
return -EINVAL;
if (*end == '\n')
end++;
@@ -494,9 +500,23 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
clear_refs_walk.private = vma;
- if (!is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_start, vma->vm_end,
- &clear_refs_walk);
+ if (is_vm_hugetlb_page(vma))
+ continue;
+ /*
+ * Writing 1 to /proc/pid/clear_refs affects all pages.
+ *
+ * Writing 2 to /proc/pid/clear_refs only affects
+ * Anonymous pages.
+ *
+ * Writing 3 to /proc/pid/clear_refs only affects file
+ * mapped pages.
+ */
+ if (type == CLEAR_REFS_ANON && vma->vm_file)
+ continue;
+ if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
+ continue;
+ walk_page_range(vma->vm_start, vma->vm_end,
+ &clear_refs_walk);
}
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 38f7bd559f35..39b49c42a7ed 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1839,7 +1839,7 @@ EXPORT_SYMBOL(dquot_commit_info);
/*
* Definitions of diskquota operations.
*/
-struct dquot_operations dquot_operations = {
+const struct dquot_operations dquot_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -2461,7 +2461,7 @@ out:
}
EXPORT_SYMBOL(vfs_set_dqinfo);
-struct quotactl_ops vfs_quotactl_ops = {
+const struct quotactl_ops vfs_quotactl_ops = {
.quota_on = vfs_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7adea74d6a8a..f0ad05f38022 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -612,7 +612,7 @@ static int reiserfs_mark_dquot_dirty(struct dquot *);
static int reiserfs_write_info(struct super_block *, int);
static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
-static struct dquot_operations reiserfs_quota_operations = {
+static const struct dquot_operations reiserfs_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -629,7 +629,7 @@ static struct dquot_operations reiserfs_quota_operations = {
.destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops reiserfs_qctl_operations = {
+static const struct quotactl_ops reiserfs_qctl_operations = {
.quota_on = reiserfs_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 4ab3c03d8f95..47f132df0c3f 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -284,7 +284,7 @@ static const struct file_operations romfs_dir_operations = {
.readdir = romfs_readdir,
};
-static struct inode_operations romfs_dir_inode_operations = {
+static const struct inode_operations romfs_dir_inode_operations = {
.lookup = romfs_lookup,
};
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index cb5fc57e370b..6c197ef53add 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -44,7 +44,7 @@
#include "squashfs.h"
static struct file_system_type squashfs_fs_type;
-static struct super_operations squashfs_super_ops;
+static const struct super_operations squashfs_super_ops;
static int supported_squashfs_filesystem(short major, short minor, short comp)
{
@@ -444,7 +444,7 @@ static struct file_system_type squashfs_fs_type = {
.fs_flags = FS_REQUIRES_DEV
};
-static struct super_operations squashfs_super_ops = {
+static const struct super_operations squashfs_super_ops = {
.alloc_inode = squashfs_alloc_inode,
.destroy_inode = squashfs_destroy_inode,
.statfs = squashfs_statfs,
diff --git a/fs/super.c b/fs/super.c
index 9cda337ddae2..0e7207b9815c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ DEFINE_SPINLOCK(sb_lock);
static struct super_block *alloc_super(struct file_system_type *type)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
- static struct super_operations default_op;
+ static const struct super_operations default_op;
if (s) {
if (security_sb_alloc(s)) {
@@ -707,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
+
+ /*
+ * We set the bdi here to the queue backing, file systems can
+ * overwrite this in ->fill_super()
+ */
+ s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
}
diff --git a/fs/sync.c b/fs/sync.c
index 192340930bb4..c08467a5d7cb 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,6 +27,13 @@
*/
static int __sync_filesystem(struct super_block *sb, int wait)
{
+ /*
+ * This should be safe, as we require bdi backing to actually
+ * write out data in the first place
+ */
+ if (!sb->s_bdi)
+ return 0;
+
/* Avoid doing twice syncing and cache pruning for quota sync */
if (!wait) {
writeout_quota_sb(sb, -1);
@@ -101,7 +108,7 @@ restart:
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
- if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+ if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
__sync_filesystem(sb, wait);
up_read(&sb->s_umount);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 1c8991b0db13..076ca50e9933 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,29 +54,15 @@
* @nr_to_write: how many dirty pages to write-back
*
* This function shrinks UBIFS liability by means of writing back some amount
- * of dirty inodes and their pages. Returns the amount of pages which were
- * written back. The returned value does not include dirty inodes which were
- * synchronized.
+ * of dirty inodes and their pages.
*
* Note, this function synchronizes even VFS inodes which are locked
* (@i_mutex) by the caller of the budgeting function, because write-back does
* not touch @i_mutex.
*/
-static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+static void shrink_liability(struct ubifs_info *c, int nr_to_write)
{
- int nr_written;
-
- nr_written = writeback_inodes_sb(c->vfs_sb);
- if (!nr_written) {
- /*
- * Re-try again but wait on pages/inodes which are being
- * written-back concurrently (e.g., by pdflush).
- */
- nr_written = sync_inodes_sb(c->vfs_sb);
- }
-
- dbg_budg("%d pages were written back", nr_written);
- return nr_written;
+ writeback_inodes_sb(c->vfs_sb);
}
/**
@@ -729,7 +715,7 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
* ubifs_get_free_space - return amount of free space.
* @c: UBIFS file-system description object
*
- * This function calculates and retuns amount of free space to report to
+ * This function calculates and returns amount of free space to report to
* user-space.
*/
long long ubifs_get_free_space(struct ubifs_info *c)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index f3a7945527fb..4775af401167 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -510,7 +510,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
int first = 1, iip;
struct ubifs_debug_info *d = c->dbg;
- union ubifs_key lower_key, upper_key, l_key, u_key;
+ union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
unsigned long long uninitialized_var(last_sqnum);
struct ubifs_idx_node *idx;
struct list_head list;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index ce2cd8343618..dbc093afd946 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -210,6 +210,20 @@ const char *dbg_cstate(int cmt_state)
}
}
+const char *dbg_jhead(int jhead)
+{
+ switch (jhead) {
+ case GCHD:
+ return "0 (GC)";
+ case BASEHD:
+ return "1 (base)";
+ case DATAHD:
+ return "2 (data)";
+ default:
+ return "unknown journal head";
+ }
+}
+
static void dump_ch(const struct ubifs_ch *ch)
{
printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
@@ -623,8 +637,9 @@ void dbg_dump_budg(struct ubifs_info *c)
/* If we are in R/O mode, journal heads do not exist */
if (c->jheads)
for (i = 0; i < c->jhead_cnt; i++)
- printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
- c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
+ printk(KERN_DEBUG "\tjhead %s\t LEB %d\n",
+ dbg_jhead(c->jheads[i].wbuf.jhead),
+ c->jheads[i].wbuf.lnum);
for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
bud = rb_entry(rb, struct ubifs_bud, rb);
printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -648,9 +663,90 @@ void dbg_dump_budg(struct ubifs_info *c)
void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
{
- printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), "
- "flags %#x\n", lp->lnum, lp->free, lp->dirty,
- c->leb_size - lp->free - lp->dirty, lp->flags);
+ int i, spc, dark = 0, dead = 0;
+ struct rb_node *rb;
+ struct ubifs_bud *bud;
+
+ spc = lp->free + lp->dirty;
+ if (spc < c->dead_wm)
+ dead = spc;
+ else
+ dark = ubifs_calc_dark(c, spc);
+
+ if (lp->flags & LPROPS_INDEX)
+ printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+ "free + dirty %-8d flags %#x (", lp->lnum, lp->free,
+ lp->dirty, c->leb_size - spc, spc, lp->flags);
+ else
+ printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+ "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
+ "flags %#-4x (", lp->lnum, lp->free, lp->dirty,
+ c->leb_size - spc, spc, dark, dead,
+ (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
+
+ if (lp->flags & LPROPS_TAKEN) {
+ if (lp->flags & LPROPS_INDEX)
+ printk(KERN_CONT "index, taken");
+ else
+ printk(KERN_CONT "taken");
+ } else {
+ const char *s;
+
+ if (lp->flags & LPROPS_INDEX) {
+ switch (lp->flags & LPROPS_CAT_MASK) {
+ case LPROPS_DIRTY_IDX:
+ s = "dirty index";
+ break;
+ case LPROPS_FRDI_IDX:
+ s = "freeable index";
+ break;
+ default:
+ s = "index";
+ }
+ } else {
+ switch (lp->flags & LPROPS_CAT_MASK) {
+ case LPROPS_UNCAT:
+ s = "not categorized";
+ break;
+ case LPROPS_DIRTY:
+ s = "dirty";
+ break;
+ case LPROPS_FREE:
+ s = "free";
+ break;
+ case LPROPS_EMPTY:
+ s = "empty";
+ break;
+ case LPROPS_FREEABLE:
+ s = "freeable";
+ break;
+ default:
+ s = NULL;
+ break;
+ }
+ }
+ printk(KERN_CONT "%s", s);
+ }
+
+ for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) {
+ bud = rb_entry(rb, struct ubifs_bud, rb);
+ if (bud->lnum == lp->lnum) {
+ int head = 0;
+ for (i = 0; i < c->jhead_cnt; i++) {
+ if (lp->lnum == c->jheads[i].wbuf.lnum) {
+ printk(KERN_CONT ", jhead %s",
+ dbg_jhead(i));
+ head = 1;
+ }
+ }
+ if (!head)
+ printk(KERN_CONT ", bud of jhead %s",
+ dbg_jhead(bud->jhead));
+ }
+ }
+ if (lp->lnum == c->gc_lnum)
+ printk(KERN_CONT ", GC LEB");
+ printk(KERN_CONT ")\n");
}
void dbg_dump_lprops(struct ubifs_info *c)
@@ -724,7 +820,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
current->pid, lnum);
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
if (IS_ERR(sleb)) {
ubifs_err("scan error %d", (int)PTR_ERR(sleb));
return;
@@ -909,8 +1005,10 @@ out:
ubifs_msg("saved lprops statistics dump");
dbg_dump_lstats(&d->saved_lst);
ubifs_get_lp_stats(c, &lst);
+
ubifs_msg("current lprops statistics dump");
- dbg_dump_lstats(&d->saved_lst);
+ dbg_dump_lstats(&lst);
+
spin_lock(&c->space_lock);
dbg_dump_budg(c);
spin_unlock(&c->space_lock);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index c1cd73b2e06e..29d960101ea6 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -271,6 +271,7 @@ void ubifs_debugging_exit(struct ubifs_info *c);
/* Dump functions */
const char *dbg_ntype(int type);
const char *dbg_cstate(int cmt_state);
+const char *dbg_jhead(int jhead);
const char *dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key);
void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
@@ -321,6 +322,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
int dbg_check_lprops(struct ubifs_info *c);
int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
int row, int col);
+int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
+ loff_t size);
/* Force the use of in-the-gaps method for testing */
@@ -425,6 +428,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
#define dbg_ntype(type) ""
#define dbg_cstate(cmt_state) ""
+#define dbg_jhead(jhead) ""
#define dbg_get_key_dump(c, key) ({})
#define dbg_dump_inode(c, inode) ({})
#define dbg_dump_node(c, node) ({})
@@ -460,6 +464,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
#define dbg_check_heap(c, heap, cat, add_pos) ({})
#define dbg_check_lprops(c) 0
#define dbg_check_lpt_nodes(c, cnode, row, col) 0
+#define dbg_check_inode_size(c, inode, size) 0
#define dbg_force_in_the_gaps_enabled 0
#define dbg_force_in_the_gaps() 0
#define dbg_failure_mode 0
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6d34dc7e33e1..2e6481a7701c 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -21,34 +21,32 @@
*/
/*
- * This file implements VFS file and inode operations of regular files, device
+ * This file implements VFS file and inode operations for regular files, device
* nodes and symlinks as well as address space operations.
*
- * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the
- * page is dirty and is used for budgeting purposes - dirty pages should not be
- * budgeted. The PG_checked flag is set if full budgeting is required for the
- * page e.g., when it corresponds to a file hole or it is just beyond the file
- * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to
- * fail in this function, and the budget is released in 'ubifs_write_end()'. So
- * the PG_private and PG_checked flags carry the information about how the page
- * was budgeted, to make it possible to release the budget properly.
+ * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if
+ * the page is dirty and is used for optimization purposes - dirty pages are
+ * not budgeted so the flag shows that 'ubifs_write_end()' should not release
+ * the budget for this page. The @PG_checked flag is set if full budgeting is
+ * required for the page e.g., when it corresponds to a file hole or it is
+ * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because
+ * it is OK to fail in this function, and the budget is released in
+ * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry
+ * information about how the page was budgeted, to make it possible to release
+ * the budget properly.
*
- * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations
- * we implement. However, this is not true for '->writepage()', which might be
- * called with 'i_mutex' unlocked. For example, when pdflush is performing
- * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the
- * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is
- * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim
- * path'. So, in '->writepage()' we are only guaranteed that the page is
- * locked.
+ * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
+ * implement. However, this is not true for 'ubifs_writepage()', which may be
+ * called with @i_mutex unlocked. For example, when pdflush is doing background
+ * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal"
+ * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the
+ * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()'
+ * we are only guaranteed that the page is locked.
*
- * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g.,
- * readahead path does not have it locked ("sys_read -> generic_file_aio_read
- * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is
- * not set as well. However, UBIFS disables readahead.
- *
- * This, for example means that there might be 2 concurrent '->writepage()'
- * calls for the same inode, but different inode dirty pages.
+ * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
+ * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
+ * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not
+ * set as well. However, UBIFS disables readahead.
*/
#include "ubifs.h"
@@ -449,9 +447,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
/*
* We change whole page so no need to load it. But we
* have to set the @PG_checked flag to make the further
- * code the page is new. This might be not true, but it
- * is better to budget more that to read the page from
- * the media.
+ * code know that the page is new. This might be not
+ * true, but it is better to budget more than to read
+ * the page from the media.
*/
SetPageChecked(page);
skipped_read = 1;
@@ -497,8 +495,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
}
/*
- * Whee, we aquired budgeting quickly - without involving
- * garbage-collection, committing or forceing write-back. We return
+ * Whee, we acquired budgeting quickly - without involving
+ * garbage-collection, committing or forcing write-back. We return
* with @ui->ui_mutex locked if we are appending pages, and unlocked
* otherwise. This is an optimization (slightly hacky though).
*/
@@ -562,7 +560,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
/*
* Return 0 to force VFS to repeat the whole operation, or the
- * error code if 'do_readpage()' failes.
+ * error code if 'do_readpage()' fails.
*/
copied = do_readpage(page);
goto out;
@@ -1175,11 +1173,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
- /* The other attributes may be changed at the same time as well */
+ /* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
-
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
mutex_unlock(&ui->ui_mutex);
+
out_budg:
if (budgeted)
ubifs_release_budget(c, &req);
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index f0f5f15d384e..618c2701d3a7 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -529,7 +529,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
* We scan the entire LEB even though we only really need to scan up to
* (c->leb_size - lp->free).
*/
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 762a7d6cec73..e589fedaf1ef 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -297,7 +297,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
{
struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
- dbg_io("jhead %d", wbuf->jhead);
+ dbg_io("jhead %s", dbg_jhead(wbuf->jhead));
wbuf->need_sync = 1;
wbuf->c->need_wbuf_sync = 1;
ubifs_wake_up_bgt(wbuf->c);
@@ -314,7 +314,8 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
if (wbuf->no_timer)
return;
- dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead,
+ dbg_io("set timer for jhead %s, %llu-%llu millisecs",
+ dbg_jhead(wbuf->jhead),
div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
USEC_PER_SEC));
@@ -351,8 +352,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
/* Write-buffer is empty or not seeked */
return 0;
- dbg_io("LEB %d:%d, %d bytes, jhead %d",
- wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead);
+ dbg_io("LEB %d:%d, %d bytes, jhead %s",
+ wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
ubifs_assert(!(wbuf->avail & 7));
ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
@@ -401,7 +402,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
{
const struct ubifs_info *c = wbuf->c;
- dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead);
+ dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead));
ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
ubifs_assert(offs >= 0 && offs <= c->leb_size);
ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
@@ -508,9 +509,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
struct ubifs_info *c = wbuf->c;
int err, written, n, aligned_len = ALIGN(len, 8), offs;
- dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len,
- dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead,
- wbuf->lnum, wbuf->offs + wbuf->used);
+ dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
+ dbg_ntype(((struct ubifs_ch *)buf)->node_type),
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used);
ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
@@ -535,8 +536,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
memcpy(wbuf->buf + wbuf->used, buf, len);
if (aligned_len == wbuf->avail) {
- dbg_io("flush jhead %d wbuf to LEB %d:%d",
- wbuf->jhead, wbuf->lnum, wbuf->offs);
+ dbg_io("flush jhead %s wbuf to LEB %d:%d",
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
wbuf->offs, c->min_io_size,
wbuf->dtype);
@@ -564,8 +565,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
* minimal I/O unit. We have to fill and flush write-buffer and switch
* to the next min. I/O unit.
*/
- dbg_io("flush jhead %d wbuf to LEB %d:%d",
- wbuf->jhead, wbuf->lnum, wbuf->offs);
+ dbg_io("flush jhead %s wbuf to LEB %d:%d",
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
c->min_io_size, wbuf->dtype);
@@ -698,8 +699,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
int err, rlen, overlap;
struct ubifs_ch *ch = buf;
- dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs,
- dbg_ntype(type), len, wbuf->jhead);
+ dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs,
+ dbg_ntype(type), len, dbg_jhead(wbuf->jhead));
ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
ubifs_assert(!(offs & 7) && offs < c->leb_size);
ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 64b5f3a309f5..d321baeca68d 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -158,7 +158,7 @@ again:
* some. But the write-buffer mutex has to be unlocked because
* GC also takes it.
*/
- dbg_jnl("no free space jhead %d, run GC", jhead);
+ dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead));
mutex_unlock(&wbuf->io_mutex);
lnum = ubifs_garbage_collect(c, 0);
@@ -173,7 +173,8 @@ again:
* because we dropped @wbuf->io_mutex, so try once
* again.
*/
- dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead);
+ dbg_jnl("GC couldn't make a free LEB for jhead %s",
+ dbg_jhead(jhead));
if (retries++ < 2) {
dbg_jnl("retry (%d)", retries);
goto again;
@@ -184,7 +185,7 @@ again:
}
mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
- dbg_jnl("got LEB %d for jhead %d", lnum, jhead);
+ dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead));
avail = c->leb_size - wbuf->offs - wbuf->used;
if (wbuf->lnum != -1 && avail >= len) {
@@ -255,7 +256,8 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
*lnum = c->jheads[jhead].wbuf.lnum;
*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
- dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+ dbg_jnl("jhead %s, LEB %d:%d, len %d",
+ dbg_jhead(jhead), *lnum, *offs, len);
ubifs_prepare_node(c, node, len, 0);
return ubifs_wbuf_write_nolock(wbuf, node, len);
@@ -285,7 +287,8 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
*lnum = c->jheads[jhead].wbuf.lnum;
*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
- dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+ dbg_jnl("jhead %s, LEB %d:%d, len %d",
+ dbg_jhead(jhead), *lnum, *offs, len);
err = ubifs_wbuf_write_nolock(wbuf, buf, len);
if (err)
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 5fa27ea031ba..0f530c684f0b 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -229,23 +229,6 @@ static inline void xent_key_init(const struct ubifs_info *c,
}
/**
- * xent_key_init_hash - initialize extended attribute entry key without
- * re-calculating hash function.
- * @c: UBIFS file-system description object
- * @key: key to initialize
- * @inum: host inode number
- * @hash: extended attribute entry name hash
- */
-static inline void xent_key_init_hash(const struct ubifs_info *c,
- union ubifs_key *key, ino_t inum,
- uint32_t hash)
-{
- ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
- key->u32[0] = inum;
- key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
-}
-
-/**
* xent_key_init_flash - initialize on-flash extended attribute entry key.
* @c: UBIFS file-system description object
* @k: key to initialize
@@ -295,22 +278,15 @@ static inline void data_key_init(const struct ubifs_info *c,
}
/**
- * data_key_init_flash - initialize on-flash data key.
+ * highest_data_key - get the highest possible data key for an inode.
* @c: UBIFS file-system description object
- * @k: key to initialize
+ * @key: key to initialize
* @inum: inode number
- * @block: block number
*/
-static inline void data_key_init_flash(const struct ubifs_info *c, void *k,
- ino_t inum, unsigned int block)
+static inline void highest_data_key(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum)
{
- union ubifs_key *key = k;
-
- ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
- key->j32[0] = cpu_to_le32(inum);
- key->j32[1] = cpu_to_le32(block |
- (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
- memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+ data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK);
}
/**
@@ -554,4 +530,5 @@ static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
return 0;
}
}
+
#endif /* !__UBIFS_KEY_H__ */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 56e33772a1ee..c345e125f42c 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -169,8 +169,8 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
*/
c->bud_bytes += c->leb_size - bud->start;
- dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
- bud->start, bud->jhead, c->bud_bytes);
+ dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum,
+ bud->start, dbg_jhead(bud->jhead), c->bud_bytes);
spin_unlock(&c->buds_lock);
}
@@ -355,16 +355,16 @@ static void remove_buds(struct ubifs_info *c)
* heads (non-closed buds).
*/
c->cmt_bud_bytes += wbuf->offs - bud->start;
- dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
+ dbg_log("preserve %d:%d, jhead %s, bud bytes %d, "
"cmt_bud_bytes %lld", bud->lnum, bud->start,
- bud->jhead, wbuf->offs - bud->start,
+ dbg_jhead(bud->jhead), wbuf->offs - bud->start,
c->cmt_bud_bytes);
bud->start = wbuf->offs;
} else {
c->cmt_bud_bytes += c->leb_size - bud->start;
- dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
+ dbg_log("remove %d:%d, jhead %s, bud bytes %d, "
"cmt_bud_bytes %lld", bud->lnum, bud->start,
- bud->jhead, c->leb_size - bud->start,
+ dbg_jhead(bud->jhead), c->leb_size - bud->start,
c->cmt_bud_bytes);
rb_erase(p1, &c->buds);
/*
@@ -429,7 +429,8 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
if (lnum == -1 || offs == c->leb_size)
continue;
- dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
+ dbg_log("add ref to LEB %d:%d for jhead %s",
+ lnum, offs, dbg_jhead(i));
ref = buf + len;
ref->ch.node_type = UBIFS_REF_NODE;
ref->lnum = cpu_to_le32(lnum);
@@ -695,7 +696,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
lnum = c->ltail_lnum;
write_lnum = lnum;
while (1) {
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
goto out_free;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4cdd284dea56..4d4ca388889b 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -281,7 +281,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
case LPROPS_FREE:
if (add_to_lpt_heap(c, lprops, cat))
break;
- /* No more room on heap so make it uncategorized */
+ /* No more room on heap so make it un-categorized */
cat = LPROPS_UNCAT;
/* Fall through */
case LPROPS_UNCAT:
@@ -375,8 +375,8 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
* @lprops: LEB properties
*
* A LEB may have fallen off of the bottom of a heap, and ended up as
- * uncategorized even though it has enough space for us now. If that is the case
- * this function will put the LEB back onto a heap.
+ * un-categorized even though it has enough space for us now. If that is the
+ * case this function will put the LEB back onto a heap.
*/
void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
{
@@ -436,10 +436,10 @@ int ubifs_categorize_lprops(const struct ubifs_info *c,
/**
* change_category - change LEB properties category.
* @c: UBIFS file-system description object
- * @lprops: LEB properties to recategorize
+ * @lprops: LEB properties to re-categorize
*
* LEB properties are categorized to enable fast find operations. When the LEB
- * properties change they must be recategorized.
+ * properties change they must be re-categorized.
*/
static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
{
@@ -461,21 +461,18 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
}
/**
- * calc_dark - calculate LEB dark space size.
+ * ubifs_calc_dark - calculate LEB dark space size.
* @c: the UBIFS file-system description object
* @spc: amount of free and dirty space in the LEB
*
- * This function calculates amount of dark space in an LEB which has @spc bytes
- * of free and dirty space. Returns the calculations result.
+ * This function calculates and returns amount of dark space in an LEB which
+ * has @spc bytes of free and dirty space.
*
- * Dark space is the space which is not always usable - it depends on which
- * nodes are written in which order. E.g., if an LEB has only 512 free bytes,
- * it is dark space, because it cannot fit a large data node. So UBIFS cannot
- * count on this LEB and treat these 512 bytes as usable because it is not true
- * if, for example, only big chunks of uncompressible data will be written to
- * the FS.
+ * UBIFS is trying to account the space which might not be usable, and this
+ * space is called "dark space". For example, if an LEB has only %512 free
+ * bytes, it is dark space, because it cannot fit a large data node.
*/
-static int calc_dark(struct ubifs_info *c, int spc)
+int ubifs_calc_dark(const struct ubifs_info *c, int spc)
{
ubifs_assert(!(spc & 7));
@@ -518,7 +515,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
* @free: new free space amount
* @dirty: new dirty space amount
* @flags: new flags
- * @idx_gc_cnt: change to the count of idx_gc list
+ * @idx_gc_cnt: change to the count of @idx_gc list
*
* This function changes LEB properties (@free, @dirty or @flag). However, the
* property which has the %LPROPS_NC value is not changed. Returns a pointer to
@@ -535,7 +532,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
{
/*
* This is the only function that is allowed to change lprops, so we
- * discard the const qualifier.
+ * discard the "const" qualifier.
*/
struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
@@ -575,7 +572,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
if (old_spc < c->dead_wm)
c->lst.total_dead -= old_spc;
else
- c->lst.total_dark -= calc_dark(c, old_spc);
+ c->lst.total_dark -= ubifs_calc_dark(c, old_spc);
c->lst.total_used -= c->leb_size - old_spc;
}
@@ -616,7 +613,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
if (new_spc < c->dead_wm)
c->lst.total_dead += new_spc;
else
- c->lst.total_dark += calc_dark(c, new_spc);
+ c->lst.total_dark += ubifs_calc_dark(c, new_spc);
c->lst.total_used += c->leb_size - new_spc;
}
@@ -1096,7 +1093,7 @@ static int scan_check_cb(struct ubifs_info *c,
}
}
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
if (IS_ERR(sleb)) {
/*
* After an unclean unmount, empty and freeable LEBs
@@ -1107,7 +1104,7 @@ static int scan_check_cb(struct ubifs_info *c,
"- continuing checking");
lst->empty_lebs += 1;
lst->total_free += c->leb_size;
- lst->total_dark += calc_dark(c, c->leb_size);
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
return LPT_SCAN_CONTINUE;
}
@@ -1117,7 +1114,7 @@ static int scan_check_cb(struct ubifs_info *c,
"- continuing checking");
lst->total_free += lp->free;
lst->total_dirty += lp->dirty;
- lst->total_dark += calc_dark(c, c->leb_size);
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
return LPT_SCAN_CONTINUE;
}
data->err = PTR_ERR(sleb);
@@ -1235,7 +1232,7 @@ static int scan_check_cb(struct ubifs_info *c,
if (spc < c->dead_wm)
lst->total_dead += spc;
else
- lst->total_dark += calc_dark(c, spc);
+ lst->total_dark += ubifs_calc_dark(c, spc);
}
ubifs_scan_destroy(sleb);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index a88f33801b98..28beaeedadc0 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -29,7 +29,8 @@
* @c: UBIFS file-system description object
*
* This function scans the master node LEBs and search for the latest master
- * node. Returns zero in case of success and a negative error code in case of
+ * node. Returns zero in case of success, %-EUCLEAN if there master area is
+ * corrupted and requires recovery, and a negative error code in case of
* failure.
*/
static int scan_for_master(struct ubifs_info *c)
@@ -40,7 +41,7 @@ static int scan_for_master(struct ubifs_info *c)
lnum = UBIFS_MST_LNUM;
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
nodes_cnt = sleb->nodes_cnt;
@@ -48,7 +49,7 @@ static int scan_for_master(struct ubifs_info *c)
snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
list);
if (snod->type != UBIFS_MST_NODE)
- goto out;
+ goto out_dump;
memcpy(c->mst_node, snod->node, snod->len);
offs = snod->offs;
}
@@ -56,7 +57,7 @@ static int scan_for_master(struct ubifs_info *c)
lnum += 1;
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
if (sleb->nodes_cnt != nodes_cnt)
@@ -65,7 +66,7 @@ static int scan_for_master(struct ubifs_info *c)
goto out;
snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
if (snod->type != UBIFS_MST_NODE)
- goto out;
+ goto out_dump;
if (snod->offs != offs)
goto out;
if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
@@ -78,6 +79,12 @@ static int scan_for_master(struct ubifs_info *c)
out:
ubifs_scan_destroy(sleb);
+ return -EUCLEAN;
+
+out_dump:
+ ubifs_err("unexpected node type %d master LEB %d:%d",
+ snod->type, lnum, snod->offs);
+ ubifs_scan_destroy(sleb);
return -EINVAL;
}
@@ -256,7 +263,8 @@ int ubifs_read_master(struct ubifs_info *c)
err = scan_for_master(c);
if (err) {
- err = ubifs_recover_master_node(c);
+ if (err == -EUCLEAN)
+ err = ubifs_recover_master_node(c);
if (err)
/*
* Note, we do not free 'c->mst_node' here because the
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 152a7b34a141..82009c74b6a3 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -670,9 +670,10 @@ static int kill_orphans(struct ubifs_info *c)
struct ubifs_scan_leb *sleb;
dbg_rcvry("LEB %d", lnum);
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb)) {
- sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
+ if (PTR_ERR(sleb) == -EUCLEAN)
+ sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
@@ -899,7 +900,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
struct ubifs_scan_leb *sleb;
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index e5f6cf8a1155..f94ddf7efba0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -286,7 +286,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
mst = mst2;
}
- dbg_rcvry("recovered master node from LEB %d",
+ ubifs_msg("recovered master node from LEB %d",
(mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
@@ -790,7 +790,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
* We can only recover at the end of the log, so check that the
* next log LEB is empty or out of date.
*/
- sleb = ubifs_scan(c, next_lnum, 0, sbuf);
+ sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0);
if (IS_ERR(sleb))
return sleb;
if (sleb->nodes_cnt) {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 2970500f32df..5c2d6d759a3e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -506,7 +506,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
if (c->need_recovery)
sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
else
- sleb = ubifs_scan(c, lnum, offs, c->sbuf);
+ sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
@@ -836,8 +836,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
const struct ubifs_cs_node *node;
dbg_mnt("replay log LEB %d:%d", lnum, offs);
- sleb = ubifs_scan(c, lnum, offs, sbuf);
- if (IS_ERR(sleb) ) {
+ sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery);
+ if (IS_ERR(sleb)) {
if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
return PTR_ERR(sleb);
sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 892ebfee4fe5..96c525384191 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -108,10 +108,9 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
/* Make the node pads to 8-byte boundary */
if ((node_len + pad_len) & 7) {
- if (!quiet) {
+ if (!quiet)
dbg_err("bad padding length %d - %d",
offs, offs + node_len + pad_len);
- }
return SCANNED_A_BAD_PAD_NODE;
}
@@ -253,15 +252,19 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
* @c: UBIFS file-system description object
* @lnum: logical eraseblock number
* @offs: offset to start at (usually zero)
- * @sbuf: scan buffer (must be c->leb_size)
+ * @sbuf: scan buffer (must be of @c->leb_size bytes in size)
+ * @quiet: print no messages
*
* This function scans LEB number @lnum and returns complete information about
* its contents. Returns the scaned information in case of success and,
* %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
* of failure.
+ *
+ * If @quiet is non-zero, this function does not print large and scary
+ * error messages and flash dumps in case of errors.
*/
struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
- int offs, void *sbuf)
+ int offs, void *sbuf, int quiet)
{
void *buf = sbuf + offs;
int err, len = c->leb_size - offs;
@@ -280,7 +283,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
cond_resched();
- ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
if (ret > 0) {
/* Padding bytes or a valid padding node */
offs += ret;
@@ -320,7 +323,9 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
}
if (offs % c->min_io_size) {
- ubifs_err("empty space starts at non-aligned offset %d", offs);
+ if (!quiet)
+ ubifs_err("empty space starts at non-aligned offset %d",
+ offs);
goto corrupted;;
}
@@ -331,18 +336,25 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
break;
for (; len; offs++, buf++, len--)
if (*(uint8_t *)buf != 0xff) {
- ubifs_err("corrupt empty space at LEB %d:%d",
- lnum, offs);
+ if (!quiet)
+ ubifs_err("corrupt empty space at LEB %d:%d",
+ lnum, offs);
goto corrupted;
}
return sleb;
corrupted:
- ubifs_scanned_corruption(c, lnum, offs, buf);
+ if (!quiet) {
+ ubifs_scanned_corruption(c, lnum, offs, buf);
+ ubifs_err("LEB %d scanning failed", lnum);
+ }
err = -EUCLEAN;
+ ubifs_scan_destroy(sleb);
+ return ERR_PTR(err);
+
error:
- ubifs_err("LEB %d scanning failed", lnum);
+ ubifs_err("LEB %d scanning failed, error %d", lnum, err);
ubifs_scan_destroy(sleb);
return ERR_PTR(err);
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 51763aa8f4de..333e181ee987 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,7 +36,6 @@
#include <linux/mount.h>
#include <linux/math64.h>
#include <linux/writeback.h>
-#include <linux/smp_lock.h>
#include "ubifs.h"
/*
@@ -318,6 +317,8 @@ static int ubifs_write_inode(struct inode *inode, int wait)
if (err)
ubifs_err("can't write inode %lu, error %d",
inode->i_ino, err);
+ else
+ err = dbg_check_inode_size(c, inode, ui->ui_size);
}
ui->dirty = 0;
@@ -448,17 +449,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
return 0;
/*
- * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
- * pages, so synchronize them first, then commit the journal. Strictly
- * speaking, it is not necessary to commit the journal here,
- * synchronizing write-buffers would be enough. But committing makes
- * UBIFS free space predictions much more accurate, so we want to let
- * the user be able to get more accurate results of 'statfs()' after
- * they synchronize the file system.
- */
- sync_inodes_sb(sb);
-
- /*
* Synchronize write buffers, because 'ubifs_run_commit()' does not
* do this if it waits for an already running commit.
*/
@@ -468,6 +458,13 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
return err;
}
+ /*
+ * Strictly speaking, it is not necessary to commit the journal here,
+ * synchronizing write-buffers would be enough. But committing makes
+ * UBIFS free space predictions much more accurate, so we want to let
+ * the user be able to get more accurate results of 'statfs()' after
+ * they synchronize the file system.
+ */
err = ubifs_run_commit(c);
if (err)
return err;
@@ -1720,8 +1717,6 @@ static void ubifs_put_super(struct super_block *sb)
ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
c->vi.vol_id);
- lock_kernel();
-
/*
* The following asserts are only valid if there has not been a failure
* of the media. For example, there will be dirty inodes if we failed
@@ -1786,8 +1781,6 @@ static void ubifs_put_super(struct super_block *sb)
ubi_close_volume(c->ubi);
mutex_unlock(&c->umount_mutex);
kfree(c);
-
- unlock_kernel();
}
static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1803,22 +1796,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
return err;
}
- lock_kernel();
if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
if (c->ro_media) {
ubifs_msg("cannot re-mount due to prior errors");
- unlock_kernel();
return -EROFS;
}
err = ubifs_remount_rw(c);
- if (err) {
- unlock_kernel();
+ if (err)
return err;
- }
} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
if (c->ro_media) {
ubifs_msg("cannot re-mount due to prior errors");
- unlock_kernel();
return -EROFS;
}
ubifs_remount_ro(c);
@@ -1833,7 +1821,6 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
}
ubifs_assert(c->lst.taken_empty_lebs > 0);
- unlock_kernel();
return 0;
}
@@ -1980,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto out_bdi;
+ sb->s_bdi = &c->bdi;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
sb->s_blocksize = UBIFS_BLOCK_SIZE;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f249f7b0d656..e5b1a7d00fa0 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1159,8 +1159,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
* o exact match, i.e. the found zero-level znode contains key @key, then %1
* is returned and slot number of the matched branch is stored in @n;
* o not exact match, which means that zero-level znode does not contain
- * @key, then %0 is returned and slot number of the closed branch is stored
- * in @n;
+ * @key, then %0 is returned and slot number of the closest branch is stored
+ * in @n;
* o @key is so small that it is even less than the lowest key of the
* leftmost zero-level node, then %0 is returned and %0 is stored in @n.
*
@@ -1433,7 +1433,7 @@ static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
* @lnum: LEB number is returned here
* @offs: offset is returned here
*
- * This function look up and reads node with key @key. The caller has to make
+ * This function looks up and reads node with key @key. The caller has to make
* sure the @node buffer is large enough to fit the node. Returns zero in case
* of success, %-ENOENT if the node was not found, and a negative error code in
* case of failure. The node location can be returned in @lnum and @offs.
@@ -3268,3 +3268,73 @@ out_unlock:
mutex_unlock(&c->tnc_mutex);
return err;
}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_check_inode_size - check if inode size is correct.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ * @size: inode size
+ *
+ * This function makes sure that the inode size (@size) is correct and it does
+ * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL
+ * if it has a data page beyond @size, and other negative error code in case of
+ * other errors.
+ */
+int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
+ loff_t size)
+{
+ int err, n;
+ union ubifs_key from_key, to_key, *key;
+ struct ubifs_znode *znode;
+ unsigned int block;
+
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+
+ block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
+ data_key_init(c, &from_key, inode->i_ino, block);
+ highest_data_key(c, &to_key, inode->i_ino);
+
+ mutex_lock(&c->tnc_mutex);
+ err = ubifs_lookup_level0(c, &from_key, &znode, &n);
+ if (err < 0)
+ goto out_unlock;
+
+ if (err) {
+ err = -EINVAL;
+ key = &from_key;
+ goto out_dump;
+ }
+
+ err = tnc_next(c, &znode, &n);
+ if (err == -ENOENT) {
+ err = 0;
+ goto out_unlock;
+ }
+ if (err < 0)
+ goto out_unlock;
+
+ ubifs_assert(err == 0);
+ key = &znode->zbranch[n].key;
+ if (!key_in_range(c, key, &from_key, &to_key))
+ goto out_unlock;
+
+out_dump:
+ block = key_block(c, key);
+ ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
+ "(data key %s)", (unsigned long)inode->i_ino, size,
+ ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
+ dbg_dump_inode(c, inode);
+ dbg_dump_stack();
+ err = -EINVAL;
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index fde8d127c768..53288e5d604e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -245,7 +245,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
* it is more comprehensive and less efficient than is needed for this
* purpose.
*/
- sleb = ubifs_scan(c, lnum, 0, c->ileb_buf);
+ sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0);
c->ileb_len = 0;
if (IS_ERR(sleb))
return PTR_ERR(sleb);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 3eee07e0c495..191ca7863fe7 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -135,6 +135,13 @@
/* The key is always at the same position in all keyed nodes */
#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key)
+/* Garbage collector journal head number */
+#define UBIFS_GC_HEAD 0
+/* Base journal head number */
+#define UBIFS_BASE_HEAD 1
+/* Data journal head number */
+#define UBIFS_DATA_HEAD 2
+
/*
* LEB Properties Tree node types.
*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a29349094422..b2d976366a46 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -105,12 +105,10 @@
/* Number of non-data journal heads */
#define NONDATA_JHEADS_CNT 2
-/* Garbage collector head */
-#define GCHD 0
-/* Base journal head number */
-#define BASEHD 1
-/* First "general purpose" journal head */
-#define DATAHD 2
+/* Shorter names for journal head numbers for internal usage */
+#define GCHD UBIFS_GC_HEAD
+#define BASEHD UBIFS_BASE_HEAD
+#define DATAHD UBIFS_DATA_HEAD
/* 'No change' value for 'ubifs_change_lp()' */
#define LPROPS_NC 0x80000001
@@ -1451,7 +1449,7 @@ int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
/* scan.c */
struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
- int offs, void *sbuf);
+ int offs, void *sbuf, int quiet);
void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
int offs, int quiet);
@@ -1676,6 +1674,7 @@ const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
+int ubifs_calc_dark(const struct ubifs_info *c, int spc);
/* file.c */
int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index adafcf556531..195830f47569 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -78,9 +78,9 @@ enum {
SECURITY_XATTR,
};
-static struct inode_operations none_inode_operations;
-static struct address_space_operations none_address_operations;
-static struct file_operations none_file_operations;
+static const struct inode_operations none_inode_operations;
+static const struct address_space_operations none_address_operations;
+static const struct file_operations none_file_operations;
/**
* create_xattr - create an extended attribute.
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index aecf2519db76..d5e5559e31db 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
if (ip->i_d.di_size < isize) {
ip->i_d.di_size = isize;
ip->i_update_core = 1;
- ip->i_update_size = 1;
xfs_mark_inode_dirty_sync(ip);
}
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..988d8f87bc0f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -172,12 +172,21 @@ xfs_file_release(
*/
STATIC int
xfs_file_fsync(
- struct file *filp,
- struct dentry *dentry,
- int datasync)
+ struct file *file,
+ struct dentry *dentry,
+ int datasync)
{
- xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
- return -xfs_fsync(XFS_I(dentry->d_inode));
+ struct inode *inode = dentry->d_inode;
+ struct xfs_inode *ip = XFS_I(inode);
+ int error;
+
+ /* capture size updates in I/O completion before writing the inode. */
+ error = filemap_fdatawait(inode->i_mapping);
+ if (error)
+ return error;
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ return -xfs_fsync(ip);
}
STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 6c32f1d63d8c..da0159d99f82 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
#include "xfs_error.h"
#include "xfs_itable.h"
#include "xfs_rw.h"
-#include "xfs_acl.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_utils.h"
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index fde63a3c4ecc..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,19 +812,21 @@ write_retry:
/* Handle various SYNC-type writes */
if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+ loff_t end = pos + ret - 1;
int error2;
xfs_iunlock(xip, iolock);
if (need_i_mutex)
mutex_unlock(&inode->i_mutex);
- error2 = filemap_write_and_wait_range(mapping, pos,
- pos + ret - 1);
+
+ error2 = filemap_write_and_wait_range(mapping, pos, end);
if (!error)
error = error2;
if (need_i_mutex)
mutex_lock(&inode->i_mutex);
xfs_ilock(xip, iolock);
- error2 = xfs_write_sync_logforce(mp, xip);
+
+ error2 = xfs_fsync(xip);
if (!error)
error = error2;
}
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index cb6e2cca214f..9e41f91aa269 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -150,7 +150,7 @@ xfs_fs_set_xquota(
return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
}
-struct quotactl_ops xfs_quotactl_operations = {
+const struct quotactl_ops xfs_quotactl_operations = {
.quota_sync = xfs_fs_quota_sync,
.get_xstate = xfs_fs_get_xstate,
.set_xstate = xfs_fs_set_xstate,
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@
DEFINE_PER_CPU(struct xfsstats, xfsstats);
-STATIC int
-xfs_read_xfsstats(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
+static int xfs_stat_proc_show(struct seq_file *m, void *v)
{
- int c, i, j, len, val;
+ int c, i, j, val;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
};
/* Loop over all stats groups */
- for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
- len += sprintf(buffer + len, "%s", xstats[i].desc);
+ for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+ seq_printf(m, "%s", xstats[i].desc);
/* inner loop does each group */
while (j < xstats[i].endpoint) {
val = 0;
/* sum over all cpus */
for_each_possible_cpu(c)
val += *(((__u32*)&per_cpu(xfsstats, c) + j));
- len += sprintf(buffer + len, " %u", val);
+ seq_printf(m, " %u", val);
j++;
}
- buffer[len++] = '\n';
+ seq_putc(m, '\n');
}
/* extra precision counters */
for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
}
- len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
+ seq_printf(m, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- len += sprintf(buffer + len, "debug %u\n",
+ seq_printf(m, "debug %u\n",
#if defined(DEBUG)
1);
#else
0);
#endif
+ return 0;
+}
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
+static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xfs_stat_proc_show, NULL);
}
+static const struct file_operations xfs_stat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xfs_stat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
int
xfs_init_procfs(void)
{
if (!proc_mkdir("fs/xfs", NULL))
goto out;
- if (!create_proc_read_entry("fs/xfs/stat", 0, NULL,
- xfs_read_xfsstats, NULL))
+ if (!proc_create("fs/xfs/stat", 0, NULL,
+ &xfs_stat_proc_fops))
goto out_remove_entry;
return 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..bdd41c8c342f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -67,7 +67,7 @@
#include <linux/freezer.h>
#include <linux/parser.h>
-static struct super_operations xfs_super_operations;
+static const struct super_operations xfs_super_operations;
static kmem_zone_t *xfs_ioend_zone;
mempool_t *xfs_ioend_pool;
@@ -579,15 +579,19 @@ xfs_showargs(
else if (mp->m_qflags & XFS_UQUOTA_ACCT)
seq_puts(m, "," MNTOPT_UQUOTANOENF);
- if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
- seq_puts(m, "," MNTOPT_PRJQUOTA);
- else if (mp->m_qflags & XFS_PQUOTA_ACCT)
- seq_puts(m, "," MNTOPT_PQUOTANOENF);
-
- if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD))
- seq_puts(m, "," MNTOPT_GRPQUOTA);
- else if (mp->m_qflags & XFS_GQUOTA_ACCT)
- seq_puts(m, "," MNTOPT_GQUOTANOENF);
+ /* Either project or group quotas can be active, not both */
+
+ if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ seq_puts(m, "," MNTOPT_PRJQUOTA);
+ else
+ seq_puts(m, "," MNTOPT_PQUOTANOENF);
+ } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ seq_puts(m, "," MNTOPT_GRPQUOTA);
+ else
+ seq_puts(m, "," MNTOPT_GQUOTANOENF);
+ }
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, "," MNTOPT_NOQUOTA);
@@ -687,7 +691,7 @@ xfs_barrier_test(
return error;
}
-void
+STATIC void
xfs_mountfs_check_barriers(xfs_mount_t *mp)
{
int error;
@@ -1532,7 +1536,7 @@ xfs_fs_get_sb(
mnt);
}
-static struct super_operations xfs_super_operations = {
+static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
.write_inode = xfs_fs_write_inode,
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 5a2ea3a21781..18175ebd58ed 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,7 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern const struct export_operations xfs_export_operations;
extern struct xattr_handler *xfs_xattr_handlers[];
-extern struct quotactl_ops xfs_quotactl_operations;
+extern const struct quotactl_ops xfs_quotactl_operations;
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 98ef624d9baf..320be6aea492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -749,21 +749,6 @@ __xfs_inode_clear_reclaim_tag(
XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
}
-void
-xfs_inode_clear_reclaim_tag(
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
-
- read_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
-}
-
STATIC int
xfs_reclaim_inode_now(
struct xfs_inode *ip,
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 59120602588a..27920eb7a820 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -49,7 +49,6 @@ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip);
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@
struct xqmstats xqmstats;
-STATIC int
-xfs_qm_read_xfsquota(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
+static int xqm_proc_show(struct seq_file *m, void *v)
{
- int len;
-
/* maximum; incore; ratio free to inuse; freelist */
- len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
+ seq_printf(m, "%d\t%d\t%d\t%u\n",
ndquot,
xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
-
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
+ return 0;
}
-STATIC int
-xfs_qm_read_stats(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
+static int xqm_proc_open(struct inode *inode, struct file *file)
{
- int len;
+ return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqm_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
/* quota performance statistics */
- len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
+ seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
xqmstats.xs_qm_dqreclaims,
xqmstats.xs_qm_dqreclaim_misses,
xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
xqmstats.xs_qm_dqwants,
xqmstats.xs_qm_dqshake_reclaims,
xqmstats.xs_qm_dqinact_reclaims);
+ return 0;
+}
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xqmstat_proc_show, NULL);
}
+static const struct file_operations xqmstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqmstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
void
xfs_qm_init_procfs(void)
{
- create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
- create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
+ proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
+ proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
}
void
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
xfs_agino_t pagi_count; /* number of allocated inodes */
int pagb_count; /* pagb slots in use */
xfs_perag_busy_t *pagb_list; /* unstable blocks */
+
+ /*
+ * Inode allocation search lookup optimisation.
+ * If the pagino matches, the search for new inodes
+ * doesn't need to search the near ones again straight away
+ */
+ xfs_agino_t pagl_pagino;
+ xfs_agino_t pagl_leftrec;
+ xfs_agino_t pagl_rightrec;
#ifdef __KERNEL__
spinlock_t pagb_lock; /* lock for pagb_list */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8ee5b5a76a2a..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
* entry (null if none). Else, *lastxp will be set to the index
* of the found entry; *gotp will contain the entry.
*/
-xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
xfs_bmap_search_multi_extents(
xfs_ifork_t *ifp, /* inode fork pointer */
xfs_fileoff_t bno, /* block number searched for */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
int whichfork,
int *count);
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry. If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none). Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-xfs_bmbt_rec_host_t *
-xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
- xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
-
#endif /* __KERNEL__ */
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
ext_flag);
}
-/* Endian flipping versions of the bmbt extraction functions */
-void
-xfs_bmbt_disk_get_all(
- xfs_bmbt_rec_t *r,
- xfs_bmbt_irec_t *s)
-{
- __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
- get_unaligned_be64(&r->l1), s);
-}
-
/*
* Extract the blockcount field from an on disk bmap extent record.
*/
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
*l1 = 0;
}
+/* Endian flipping versions of the bmbt extraction functions */
+STATIC void
+xfs_bmbt_disk_get_all(
+ xfs_bmbt_rec_t *r,
+ xfs_bmbt_irec_t *s)
+{
+ __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+ get_unaligned_be64(&r->l1), s);
+}
+
STATIC void
xfs_bmbt_trace_record(
struct xfs_btree_cur *cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
-extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 26717388acf5..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -646,46 +646,6 @@ xfs_btree_read_bufl(
}
/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int /* error */
-xfs_btree_read_bufs(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock, /* lock flags for read_buf */
- xfs_buf_t **bpp, /* buffer for agno/agbno */
- int refval) /* ref count value for buffer */
-{
- xfs_buf_t *bp; /* return value */
- xfs_daddr_t d; /* real disk block address */
- int error;
-
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agbno != NULLAGBLOCK);
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, lock, &bp))) {
- return error;
- }
- ASSERT(!bp || !XFS_BUF_GETERROR(bp));
- if (bp != NULL) {
- switch (refval) {
- case XFS_ALLOC_BTREE_REF:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
- break;
- case XFS_INO_BTREE_REF:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
- break;
- }
- }
- *bpp = bp;
- return 0;
-}
-
-/*
* Read-ahead the block, don't wait for it, don't return a buffer.
* Long-form addressing.
*/
@@ -2951,7 +2911,7 @@ error0:
* inode we have to copy the single block it was pointing to into the
* inode.
*/
-int
+STATIC int
xfs_btree_kill_iroot(
struct xfs_btree_cur *cur)
{
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -379,20 +379,6 @@ xfs_btree_read_bufl(
int refval);/* ref count value for buffer */
/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int /* error */
-xfs_btree_read_bufs(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock, /* lock flags for read_buf */
- struct xfs_buf **bpp, /* buffer for agno/agbno */
- int refval);/* ref count value for buffer */
-
-/*
* Read-ahead the block, don't wait for it, don't return a buffer.
* Long-form addressing.
*/
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
-int xfs_btree_kill_iroot(struct xfs_btree_cur *);
int xfs_btree_insert(struct xfs_btree_cur *, int *);
int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..ab64f3efb43b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment(
}
/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-STATIC int /* error */
-xfs_inobt_lookup_eq(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
*/
int /* error */
-xfs_inobt_lookup_ge(
+xfs_inobt_lookup(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
+ xfs_lookup_t dir, /* <=, >=, == */
int *stat) /* success/failure */
{
cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ cur->bc_rec.i.ir_freecount = 0;
+ cur->bc_rec.i.ir_free = 0;
+ return xfs_btree_lookup(cur, dir, stat);
}
/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int /* error */
-xfs_inobt_lookup_le(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
-}
-
-/*
- * Update the record referred to by cur to the value given
- * by [ino, fcnt, free].
+ * Update the record referred to by cur to the value given.
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
STATIC int /* error */
xfs_inobt_update(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free) /* free inode mask */
+ xfs_inobt_rec_incore_t *irec) /* btree record */
{
union xfs_btree_rec rec;
- rec.inobt.ir_startino = cpu_to_be32(ino);
- rec.inobt.ir_freecount = cpu_to_be32(fcnt);
- rec.inobt.ir_free = cpu_to_be64(free);
+ rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+ rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
return xfs_btree_update(cur, &rec);
}
@@ -135,9 +95,7 @@ xfs_inobt_update(
int /* error */
xfs_inobt_get_rec(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t *ino, /* output: starting inode of chunk */
- __int32_t *fcnt, /* output: number of free inodes */
- xfs_inofree_t *free, /* output: free inode mask */
+ xfs_inobt_rec_incore_t *irec, /* btree record */
int *stat) /* output: success/failure */
{
union xfs_btree_rec *rec;
@@ -145,14 +103,136 @@ xfs_inobt_get_rec(
error = xfs_btree_get_rec(cur, &rec, stat);
if (!error && *stat == 1) {
- *ino = be32_to_cpu(rec->inobt.ir_startino);
- *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
- *free = be64_to_cpu(rec->inobt.ir_free);
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
return error;
}
/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+ struct xfs_btree_cur *cur,
+ struct xfs_agi *agi)
+{
+ if (cur->bc_nlevels == 1) {
+ xfs_inobt_rec_incore_t rec;
+ int freecount = 0;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+
+ do {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+
+ if (i) {
+ freecount += rec.ir_freecount;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ return error;
+ }
+ } while (i == 1);
+
+ if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+ ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+ }
+ return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi) 0
+#endif
+
+/*
+ * Initialise a new set of inodes.
+ */
+STATIC void
+xfs_ialloc_inode_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t length,
+ unsigned int gen)
+{
+ struct xfs_buf *fbuf;
+ struct xfs_dinode *free;
+ int blks_per_cluster, nbufs, ninodes;
+ int version;
+ int i, j;
+ xfs_daddr_t d;
+
+ /*
+ * Loop over the new block(s), filling in the inodes.
+ * For small block sizes, manipulate the inodes in buffers
+ * which are multiples of the blocks size.
+ */
+ if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
+ blks_per_cluster = 1;
+ nbufs = length;
+ ninodes = mp->m_sb.sb_inopblock;
+ } else {
+ blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
+ mp->m_sb.sb_blocksize;
+ nbufs = length / blks_per_cluster;
+ ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
+ }
+
+ /*
+ * Figure out what version number to use in the inodes we create.
+ * If the superblock version has caught up to the one that supports
+ * the new inode format, then use the new inode version. Otherwise
+ * use the old version so that old kernels will continue to be
+ * able to use the file system.
+ */
+ if (xfs_sb_version_hasnlink(&mp->m_sb))
+ version = 2;
+ else
+ version = 1;
+
+ for (j = 0; j < nbufs; j++) {
+ /*
+ * Get the block.
+ */
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+ fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * blks_per_cluster,
+ XFS_BUF_LOCK);
+ ASSERT(fbuf);
+ ASSERT(!XFS_BUF_GETERROR(fbuf));
+
+ /*
+ * Initialize all inodes in this buffer and then log them.
+ *
+ * XXX: It would be much better if we had just one transaction
+ * to log a whole cluster of inodes instead of all the
+ * individual transactions causing a lot of log traffic.
+ */
+ xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+ for (i = 0; i < ninodes; i++) {
+ int ioffset = i << mp->m_sb.sb_inodelog;
+ uint isize = sizeof(struct xfs_dinode);
+
+ free = xfs_make_iptr(mp, fbuf, i);
+ free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ free->di_version = version;
+ free->di_gen = cpu_to_be32(gen);
+ free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+ }
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ }
+}
+
+/*
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
@@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc(
{
xfs_agi_t *agi; /* allocation group header */
xfs_alloc_arg_t args; /* allocation argument structure */
- int blks_per_cluster; /* fs blocks per inode cluster */
xfs_btree_cur_t *cur; /* inode btree cursor */
- xfs_daddr_t d; /* disk addr of buffer */
xfs_agnumber_t agno;
int error;
- xfs_buf_t *fbuf; /* new free inodes' buffer */
- xfs_dinode_t *free; /* new free inode structure */
- int i; /* inode counter */
- int j; /* block counter */
- int nbufs; /* num bufs of new inodes */
+ int i;
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
- int ninodes; /* num inodes per buf */
xfs_agino_t thisino; /* current inode number, for loop */
- int version; /* inode version number to use */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
- unsigned int gen;
args.tp = tp;
args.mp = tp->t_mountp;
@@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc(
*/
agi = XFS_BUF_TO_AGI(agbp);
newino = be32_to_cpu(agi->agi_newino);
+ agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
XFS_IALLOC_BLOCKS(args.mp);
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.type = XFS_ALLOCTYPE_THIS_BNO;
args.mod = args.total = args.wasdel = args.isfl =
args.userdata = args.minalignslop = 0;
@@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc(
* For now, just allocate blocks up front.
*/
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
/*
* Allocate a fixed-size extent of inodes.
*/
@@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc(
if (isaligned && args.fsbno == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.alignment = xfs_ialloc_cluster_alignment(&args);
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc(
return 0;
}
ASSERT(args.len == args.minlen);
- /*
- * Convert the results.
- */
- newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
- /*
- * Loop over the new block(s), filling in the inodes.
- * For small block sizes, manipulate the inodes in buffers
- * which are multiples of the blocks size.
- */
- if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
- blks_per_cluster = 1;
- nbufs = (int)args.len;
- ninodes = args.mp->m_sb.sb_inopblock;
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
- args.mp->m_sb.sb_blocksize;
- nbufs = (int)args.len / blks_per_cluster;
- ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
- }
- /*
- * Figure out what version number to use in the inodes we create.
- * If the superblock version has caught up to the one that supports
- * the new inode format, then use the new inode version. Otherwise
- * use the old version so that old kernels will continue to be
- * able to use the file system.
- */
- if (xfs_sb_version_hasnlink(&args.mp->m_sb))
- version = 2;
- else
- version = 1;
/*
+ * Stamp and write the inode buffers.
+ *
* Seed the new inode cluster with a random generation number. This
* prevents short-term reuse of generation numbers if a chunk is
* freed and then immediately reallocated. We use random numbers
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- gen = random32();
- for (j = 0; j < nbufs; j++) {
- /*
- * Get the block.
- */
- d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
- args.agbno + (j * blks_per_cluster));
- fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
- args.mp->m_bsize * blks_per_cluster,
- XFS_BUF_LOCK);
- ASSERT(fbuf);
- ASSERT(!XFS_BUF_GETERROR(fbuf));
+ xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
+ random32());
- /*
- * Initialize all inodes in this buffer and then log them.
- *
- * XXX: It would be much better if we had just one transaction to
- * log a whole cluster of inodes instead of all the individual
- * transactions causing a lot of log traffic.
- */
- xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
- for (i = 0; i < ninodes; i++) {
- int ioffset = i << args.mp->m_sb.sb_inodelog;
- uint isize = sizeof(struct xfs_dinode);
-
- free = xfs_make_iptr(args.mp, fbuf, i);
- free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
- free->di_version = version;
- free->di_gen = cpu_to_be32(gen);
- free->di_next_unlinked = cpu_to_be32(NULLAGINO);
- xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
- }
- xfs_trans_inode_alloc_buf(tp, fbuf);
- }
+ /*
+ * Convert the results.
+ */
+ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
- agno = be32_to_cpu(agi->agi_seqno);
down_read(&args.mp->m_peraglock);
args.mp->m_perag[agno].pagi_freecount += newlen;
up_read(&args.mp->m_peraglock);
agi->agi_newino = cpu_to_be32(newino);
+
/*
* Insert records describing the new inode chunk into the btree.
*/
@@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc(
for (thisino = newino;
thisino < newino + newlen;
thisino += XFS_INODES_PER_CHUNK) {
- if ((error = xfs_inobt_lookup_eq(cur, thisino,
- XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
+ cur->bc_rec.i.ir_startino = thisino;
+ cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
+ cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
+ if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
ASSERT(i == 0);
- if ((error = xfs_btree_insert(cur, &i))) {
+ error = xfs_btree_insert(cur, &i);
+ if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
@@ -539,6 +557,62 @@ nextag:
}
/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+ struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ if (left)
+ error = xfs_btree_decrement(cur, 0, &i);
+ else
+ error = xfs_btree_increment(cur, 0, &i);
+
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t agino,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+
+ return 0;
+}
+
+/*
* Visible inode allocation functions.
*/
@@ -592,8 +666,8 @@ xfs_dialloc(
int j; /* result code */
xfs_mount_t *mp; /* file system mount structure */
int offset; /* index of inode in chunk */
- xfs_agino_t pagino; /* parent's a.g. relative inode # */
- xfs_agnumber_t pagno; /* parent's allocation group number */
+ xfs_agino_t pagino; /* parent's AG relative inode # */
+ xfs_agnumber_t pagno; /* parent's AG number */
xfs_inobt_rec_incore_t rec; /* inode allocation record */
xfs_agnumber_t tagno; /* testing allocation group number */
xfs_btree_cur_t *tcur; /* temp cursor */
@@ -716,6 +790,8 @@ nextag:
*/
agno = tagno;
*IO_agbp = NULL;
+
+ restart_pagno:
cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
/*
* If pagino is 0 (this is the root inode allocation) use newino.
@@ -723,220 +799,199 @@ nextag:
*/
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
/*
- * If in the same a.g. as the parent, try to get near the parent.
+ * If in the same AG as the parent, try to get near the parent.
*/
if (pagno == agno) {
- if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
+ xfs_perag_t *pag = &mp->m_perag[agno];
+ int doneleft; /* done, to the left */
+ int doneright; /* done, to the right */
+ int searchdistance = 10;
+
+ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
goto error0;
- if (i != 0 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ if (rec.ir_freecount > 0) {
/*
* Found a free inode in the same chunk
- * as parent, done.
+ * as the parent, done.
*/
+ goto alloc_inode;
}
+
+
+ /*
+ * In the same AG as parent, but parent's chunk is full.
+ */
+
+ /* duplicate the cursor, search left & right simultaneously */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
/*
- * In the same a.g. as parent, but parent's chunk is full.
+ * Skip to last blocks looked up if same parent inode.
*/
- else {
- int doneleft; /* done, to the left */
- int doneright; /* done, to the right */
+ if (pagino != NULLAGINO &&
+ pag->pagl_pagino == pagino &&
+ pag->pagl_leftrec != NULLAGINO &&
+ pag->pagl_rightrec != NULLAGINO) {
+ error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+ &trec, &doneleft, 1);
+ if (error)
+ goto error1;
+ error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+ &rec, &doneright, 0);
if (error)
- goto error0;
- ASSERT(i == 1);
- ASSERT(j == 1);
- /*
- * Duplicate the cursor, search left & right
- * simultaneously.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- goto error0;
- /*
- * Search left with tcur, back up 1 record.
- */
- if ((error = xfs_btree_decrement(tcur, 0, &i)))
goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Search right with cur, go forward 1 record.
- */
- if ((error = xfs_btree_increment(cur, 0, &i)))
+ } else {
+ /* search left with tcur, back up 1 record */
+ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+ if (error)
goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Loop until we find the closest inode chunk
- * with a free one.
- */
- while (!doneleft || !doneright) {
- int useleft; /* using left inode
- chunk this time */
+ /* search right with cur, go forward 1 record. */
+ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+ if (error)
+ goto error1;
+ }
+
+ /*
+ * Loop until we find an inode chunk with a free inode.
+ */
+ while (!doneleft || !doneright) {
+ int useleft; /* using left inode chunk this time */
+
+ if (!--searchdistance) {
/*
- * Figure out which block is closer,
- * if both are valid.
- */
- if (!doneleft && !doneright)
- useleft =
- pagino -
- (trec.ir_startino +
- XFS_INODES_PER_CHUNK - 1) <
- rec.ir_startino - pagino;
- else
- useleft = !doneleft;
- /*
- * If checking the left, does it have
- * free inodes?
- */
- if (useleft && trec.ir_freecount) {
- /*
- * Yes, set it up as the chunk to use.
- */
- rec = trec;
- xfs_btree_del_cursor(cur,
- XFS_BTREE_NOERROR);
- cur = tcur;
- break;
- }
- /*
- * If checking the right, does it have
- * free inodes?
- */
- if (!useleft && rec.ir_freecount) {
- /*
- * Yes, it's already set up.
- */
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- break;
- }
- /*
- * If used the left, get another one
- * further left.
- */
- if (useleft) {
- if ((error = xfs_btree_decrement(tcur, 0,
- &i)))
- goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(
- tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
- /*
- * If used the right, get another one
- * further right.
+ * Not in range - save last search
+ * location and allocate a new inode
*/
- else {
- if ((error = xfs_btree_increment(cur, 0,
- &i)))
- goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(
- cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto newino;
+ }
+
+ /* figure out the closer block if both are valid. */
+ if (!doneleft && !doneright) {
+ useleft = pagino -
+ (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+ rec.ir_startino - pagino;
+ } else {
+ useleft = !doneleft;
}
- ASSERT(!doneleft || !doneright);
+
+ /* free inodes to the left? */
+ if (useleft && trec.ir_freecount) {
+ rec = trec;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = tcur;
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* free inodes to the right? */
+ if (!useleft && rec.ir_freecount) {
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* get next record to check */
+ if (useleft) {
+ error = xfs_ialloc_next_rec(tcur, &trec,
+ &doneleft, 1);
+ } else {
+ error = xfs_ialloc_next_rec(cur, &rec,
+ &doneright, 0);
+ }
+ if (error)
+ goto error1;
}
+
+ /*
+ * We've reached the end of the btree. because
+ * we are only searching a small chunk of the
+ * btree each search, there is obviously free
+ * inodes closer to the parent inode than we
+ * are now. restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
}
+
/*
- * In a different a.g. from the parent.
+ * In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
- else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
- if ((error = xfs_inobt_lookup_eq(cur,
- be32_to_cpu(agi->agi_newino), 0, 0, &i)))
+newino:
+ if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
goto error0;
- if (i == 1 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
- /*
- * The last chunk allocated in the group still has
- * a free inode.
- */
- }
- /*
- * None left in the last group, search the whole a.g.
- */
- else {
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, &rec, &j);
if (error)
goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- ASSERT(i == 1);
- for (;;) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free,
- &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (rec.ir_freecount > 0)
- break;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ if (j == 1 && rec.ir_freecount > 0) {
+ /*
+ * The last chunk allocated in the group
+ * still has a free inode.
+ */
+ goto alloc_inode;
}
}
}
+
+ /*
+ * None left in the last group, search the whole AG
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ for (;;) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if (rec.ir_freecount > 0)
+ break;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+
+alloc_inode:
offset = xfs_ialloc_find_free(&rec.ir_free);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
@@ -945,33 +1000,19 @@ nextag:
ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
- rec.ir_free)))
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
goto error0;
be32_add_cpu(&agi->agi_freecount, -1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
down_read(&mp->m_peraglock);
mp->m_perag[tagno].pagi_freecount--;
up_read(&mp->m_peraglock);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
*inop = ino;
@@ -1062,38 +1103,23 @@ xfs_difree(
* Initialize the cursor.
*/
cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
/*
* Look for the entry describing this inode.
*/
- if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
+ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.",
+ "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.",
error, mp->m_fsname);
goto error0;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
- &rec.ir_free, &i))) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error) {
cmn_err(CE_WARN,
"xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
error, mp->m_fsname);
@@ -1148,12 +1174,14 @@ xfs_difree(
} else {
*delete = 0;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
+ error = xfs_inobt_update(cur, &rec);
+ if (error) {
cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.",
+ "xfs_difree: xfs_inobt_update returned an error %d on %s.",
error, mp->m_fsname);
goto error0;
}
+
/*
* Change the inode free counts and log the ag/sb changes.
*/
@@ -1165,28 +1193,10 @@ xfs_difree(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
}
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
@@ -1297,9 +1307,7 @@ xfs_imap(
chunk_agbno = agbno - offset_agbno;
} else {
xfs_btree_cur_t *cur; /* inode btree cursor */
- xfs_agino_t chunk_agino; /* first agino in inode chunk */
- __int32_t chunk_cnt; /* count of free inodes in chunk */
- xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
+ xfs_inobt_rec_incore_t chunk_rec;
xfs_buf_t *agbp; /* agi buffer */
int i; /* temp state */
@@ -1315,15 +1323,14 @@ xfs_imap(
}
cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
- error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (error) {
xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
- "xfs_inobt_lookup_le() failed");
+ "xfs_inobt_lookup() failed");
goto error0;
}
- error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
- &chunk_free, &i);
+ error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
if (error) {
xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
"xfs_inobt_get_rec() failed");
@@ -1341,7 +1348,7 @@ xfs_imap(
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
if (error)
return error;
- chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
+ chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
offset_agbno = agbno - chunk_agbno;
}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,23 +150,15 @@ xfs_ialloc_pagi_init(
xfs_agnumber_t agno); /* allocation group number */
/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
*/
-int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+ xfs_lookup_t dir, int *stat);
/*
* Get the data from the pointed-to record.
*/
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
- __int32_t *fcnt, xfs_inofree_t *free, int *stat);
+extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec, int *stat);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index ecbf8b4d2e2e..80e526489be5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -82,7 +82,6 @@ xfs_inode_alloc(
memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
ip->i_flags = 0;
ip->i_update_core = 0;
- ip->i_update_size = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
ip->i_size = 0;
@@ -456,32 +455,6 @@ out_error_or_again:
return error;
}
-
-/*
- * Look for the inode corresponding to the given ino in the hash table.
- * If it is there and its i_transp pointer matches tp, return it.
- * Otherwise, return NULL.
- */
-xfs_inode_t *
-xfs_inode_incore(xfs_mount_t *mp,
- xfs_ino_t ino,
- xfs_trans_t *tp)
-{
- xfs_inode_t *ip;
- xfs_perag_t *pag;
-
- pag = xfs_get_perag(mp, ino);
- read_lock(&pag->pag_ici_lock);
- ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
-
- /* the returned inode must match the transaction */
- if (ip && (ip->i_transp != tp))
- return NULL;
- return ip;
-}
-
/*
* Decrement reference count of an inode structure and unlock it.
*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da428b3fe0f5..c1dc7ef5a1d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -651,7 +651,7 @@ xfs_iformat_btree(
return 0;
}
-void
+STATIC void
xfs_dinode_from_disk(
xfs_icdinode_t *to,
xfs_dinode_t *from)
@@ -1247,7 +1247,7 @@ xfs_isize_check(
* In that case the pages will still be in memory, but the inode size
* will never have been updated.
*/
-xfs_fsize_t
+STATIC xfs_fsize_t
xfs_file_last_byte(
xfs_inode_t *ip)
{
@@ -3837,7 +3837,7 @@ xfs_iext_inline_to_direct(
/*
* Resize an extent indirection array to new_size bytes.
*/
-void
+STATIC void
xfs_iext_realloc_indirect(
xfs_ifork_t *ifp, /* inode fork pointer */
int new_size) /* new indirection array size */
@@ -3862,7 +3862,7 @@ xfs_iext_realloc_indirect(
/*
* Switch from indirection array to linear (direct) extent allocations.
*/
-void
+STATIC void
xfs_iext_indirect_to_direct(
xfs_ifork_t *ifp) /* inode fork pointer */
{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65f24a3cc992..0b38b9a869ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
/* Miscellaneous state. */
unsigned short i_flags; /* see defined flags below */
unsigned char i_update_core; /* timestamps/size is dirty */
- unsigned char i_update_size; /* di_size field is dirty */
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -468,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
/*
* xfs_iget.c prototypes.
*/
-xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
- struct xfs_trans *);
int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
uint, uint, xfs_inode_t **, xfs_daddr_t);
void xfs_iput(xfs_inode_t *, uint);
@@ -504,7 +501,6 @@ void xfs_ipin(xfs_inode_t *);
void xfs_iunpin(xfs_inode_t *);
int xfs_iflush(xfs_inode_t *, uint);
void xfs_ichgtime(xfs_inode_t *, int);
-xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
@@ -572,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
struct xfs_buf **, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, xfs_daddr_t, uint);
-void xfs_dinode_from_disk(struct xfs_icdinode *,
- struct xfs_dinode *);
void xfs_dinode_to_disk(struct xfs_dinode *,
struct xfs_icdinode *);
void xfs_idestroy_fork(struct xfs_inode *, int);
@@ -592,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
void xfs_iext_realloc_direct(xfs_ifork_t *, int);
-void xfs_iext_realloc_indirect(xfs_ifork_t *, int);
-void xfs_iext_indirect_to_direct(xfs_ifork_t *);
void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
void xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..47d5b663c37e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -263,14 +263,6 @@ xfs_inode_item_format(
}
/*
- * We don't have to worry about re-ordering here because
- * the update_size field is protected by the inode lock
- * and we have that held in exclusive mode.
- */
- if (ip->i_update_size)
- ip->i_update_size = 0;
-
- /*
* Make sure to get the latest atime from the Linux inode.
*/
xfs_synchronize_atime(ip);
@@ -712,8 +704,6 @@ xfs_inode_item_unlock(
* Clear out the fields of the inode log item particular
* to the current transaction.
*/
- iip->ili_ilock_recur = 0;
- iip->ili_iolock_recur = 0;
iip->ili_flags = 0;
/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
struct xfs_inode *ili_inode; /* inode ptr */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
- unsigned short ili_ilock_recur; /* lock recursion count */
- unsigned short ili_iolock_recur; /* lock recursion count */
unsigned short ili_flags; /* misc flags */
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;
#if XFS_BIG_INUMS
#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
-#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32))
#else
#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
#endif
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..b68f9107e26c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
#include "xfs_error.h"
#include "xfs_btree.h"
-int
+STATIC int
xfs_internal_inum(
xfs_mount_t *mp,
xfs_ino_t ino)
@@ -353,9 +353,6 @@ xfs_bulkstat(
int end_of_ag; /* set if we've seen the ag end */
int error; /* error code */
int fmterror;/* bulkstat formatter result */
- __int32_t gcnt; /* current btree rec's count */
- xfs_inofree_t gfree; /* current btree rec's free mask */
- xfs_agino_t gino; /* current btree rec's start inode */
int i; /* loop index */
int icount; /* count of inodes good in irbuf */
size_t irbsize; /* size of irec buffer in bytes */
@@ -442,40 +439,43 @@ xfs_bulkstat(
* we need to get the remainder of the chunk we're in.
*/
if (agino > 0) {
+ xfs_inobt_rec_incore_t r;
+
/*
* Lookup the inode chunk that this inode lives in.
*/
- error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
+ &tmp);
if (!error && /* no I/O error */
tmp && /* lookup succeeded */
/* got the record, should always work */
- !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
- &gfree, &i)) &&
+ !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
i == 1 &&
/* this is the right chunk */
- agino < gino + XFS_INODES_PER_CHUNK &&
+ agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
/* lastino was not last in chunk */
- (chunkidx = agino - gino + 1) <
+ (chunkidx = agino - r.ir_startino + 1) <
XFS_INODES_PER_CHUNK &&
/* there are some left allocated */
xfs_inobt_maskn(chunkidx,
- XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
+ XFS_INODES_PER_CHUNK - chunkidx) &
+ ~r.ir_free) {
/*
* Grab the chunk record. Mark all the
* uninteresting inodes (because they're
* before our start point) free.
*/
for (i = 0; i < chunkidx; i++) {
- if (XFS_INOBT_MASK(i) & ~gfree)
- gcnt++;
+ if (XFS_INOBT_MASK(i) & ~r.ir_free)
+ r.ir_freecount++;
}
- gfree |= xfs_inobt_maskn(0, chunkidx);
- irbp->ir_startino = gino;
- irbp->ir_freecount = gcnt;
- irbp->ir_free = gfree;
+ r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+ irbp->ir_startino = r.ir_startino;
+ irbp->ir_freecount = r.ir_freecount;
+ irbp->ir_free = r.ir_free;
irbp++;
- agino = gino + XFS_INODES_PER_CHUNK;
- icount = XFS_INODES_PER_CHUNK - gcnt;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK;
+ icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
} else {
/*
* If any of those tests failed, bump the
@@ -493,7 +493,7 @@ xfs_bulkstat(
/*
* Start of ag. Lookup the first inode chunk.
*/
- error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
icount = 0;
}
/*
@@ -501,6 +501,8 @@ xfs_bulkstat(
* until we run out of inodes or space in the buffer.
*/
while (irbp < irbufend && icount < ubcount) {
+ xfs_inobt_rec_incore_t r;
+
/*
* Loop as long as we're unable to read the
* inode btree.
@@ -510,51 +512,55 @@ xfs_bulkstat(
if (XFS_AGINO_TO_AGBNO(mp, agino) >=
be32_to_cpu(agi->agi_length))
break;
- error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
- &tmp);
+ error = xfs_inobt_lookup(cur, agino,
+ XFS_LOOKUP_GE, &tmp);
cond_resched();
}
/*
* If ran off the end of the ag either with an error,
* or the normal way, set end and stop collecting.
*/
- if (error ||
- (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
- &gfree, &i)) ||
- i == 0) {
+ if (error) {
end_of_ag = 1;
break;
}
+
+ error = xfs_inobt_get_rec(cur, &r, &i);
+ if (error || i == 0) {
+ end_of_ag = 1;
+ break;
+ }
+
/*
* If this chunk has any allocated inodes, save it.
* Also start read-ahead now for this chunk.
*/
- if (gcnt < XFS_INODES_PER_CHUNK) {
+ if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
/*
* Loop over all clusters in the next chunk.
* Do a readahead if there are any allocated
* inodes in that cluster.
*/
- for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
- chunkidx = 0;
+ agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
+ for (chunkidx = 0;
chunkidx < XFS_INODES_PER_CHUNK;
chunkidx += nicluster,
agbno += nbcluster) {
- if (xfs_inobt_maskn(chunkidx,
- nicluster) & ~gfree)
+ if (xfs_inobt_maskn(chunkidx, nicluster)
+ & ~r.ir_free)
xfs_btree_reada_bufs(mp, agno,
agbno, nbcluster);
}
- irbp->ir_startino = gino;
- irbp->ir_freecount = gcnt;
- irbp->ir_free = gfree;
+ irbp->ir_startino = r.ir_startino;
+ irbp->ir_freecount = r.ir_freecount;
+ irbp->ir_free = r.ir_free;
irbp++;
- icount += XFS_INODES_PER_CHUNK - gcnt;
+ icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
}
/*
* Set agino to after this chunk and bump the cursor.
*/
- agino = gino + XFS_INODES_PER_CHUNK;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK;
error = xfs_btree_increment(cur, 0, &tmp);
cond_resched();
}
@@ -820,9 +826,7 @@ xfs_inumbers(
int bufidx;
xfs_btree_cur_t *cur;
int error;
- __int32_t gcnt;
- xfs_inofree_t gfree;
- xfs_agino_t gino;
+ xfs_inobt_rec_incore_t r;
int i;
xfs_ino_t ino;
int left;
@@ -855,7 +859,8 @@ xfs_inumbers(
continue;
}
cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
- error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+ &tmp);
if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
cur = NULL;
@@ -870,9 +875,8 @@ xfs_inumbers(
continue;
}
}
- if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
- &i)) ||
- i == 0) {
+ error = xfs_inobt_get_rec(cur, &r, &i);
+ if (error || i == 0) {
xfs_buf_relse(agbp);
agbp = NULL;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +885,12 @@ xfs_inumbers(
agino = 0;
continue;
}
- agino = gino + XFS_INODES_PER_CHUNK - 1;
- buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
- buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
- buffer[bufidx].xi_allocmask = ~gfree;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+ buffer[bufidx].xi_startino =
+ XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+ buffer[bufidx].xi_alloccount =
+ XFS_INODES_PER_CHUNK - r.ir_freecount;
+ buffer[bufidx].xi_allocmask = ~r.ir_free;
bufidx++;
left--;
if (bufidx == bcount) {
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
void *dibuff,
int *stat);
-int
-xfs_internal_inum(
- xfs_mount_t *mp,
- xfs_ino_t ino);
-
typedef int (*inumbers_fmt_pf)(
void __user *ubuffer, /* buffer to write to */
const xfs_inogrp_t *buffer, /* buffer to read from */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log,
extern int xlog_recover(xlog_t *log);
extern int xlog_recover_finish(xlog_t *log);
extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern void xlog_recover_process_iunlinks(xlog_t *log);
-
extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
extern void xlog_put_bp(struct xfs_buf *);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..1099395d7d6c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
* freeing of the inode and its removal from the list must be
* atomic.
*/
-void
+STATIC void
xlog_recover_process_iunlinks(
xlog_t *log)
{
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..8b6c9e807efb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
*
* The m_sb_lock must be held when this routine is called.
*/
-int
+STATIC int
xfs_mod_incore_sb_unlocked(
xfs_mount_t *mp,
xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {
extern int xfs_log_sbcount(xfs_mount_t *, uint);
extern int xfs_mountfs(xfs_mount_t *mp);
-extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
extern int xfs_unmountfs_writesb(xfs_mount_t *);
extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
- int64_t, int);
extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
uint, int);
extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -564,35 +564,6 @@ xfs_mru_cache_lookup(
}
/*
- * To look up an element using its key, but leave its location in the internal
- * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this
- * function returns NULL.
- *
- * See the comments above the declaration of the xfs_mru_cache_lookup() function
- * for important locking information pertaining to this call.
- */
-void *
-xfs_mru_cache_peek(
- xfs_mru_cache_t *mru,
- unsigned long key)
-{
- xfs_mru_cache_elem_t *elem;
-
- ASSERT(mru && mru->lists);
- if (!mru || !mru->lists)
- return NULL;
-
- spin_lock(&mru->lock);
- elem = radix_tree_lookup(&mru->store, key);
- if (!elem)
- spin_unlock(&mru->lock);
- else
- __release(mru_lock); /* help sparse not be stupid */
-
- return elem ? elem->value : NULL;
-}
-
-/*
* To release the internal data structure spinlock after having performed an
* xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
* with the data store pointer.
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_done(struct xfs_mru_cache *mru);
#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -88,90 +88,6 @@ xfs_write_clear_setuid(
}
/*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
- xfs_mount_t *mp,
- xfs_inode_t *ip)
-{
- int error = 0;
-
- /*
- * If we're treating this as O_DSYNC and we have not updated the
- * size, force the log.
- */
- if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
- !(ip->i_update_size)) {
- xfs_inode_log_item_t *iip = ip->i_itemp;
-
- /*
- * If an allocation transaction occurred
- * without extending the size, then we have to force
- * the log up the proper point to ensure that the
- * allocation is permanent. We can't count on
- * the fact that buffered writes lock out direct I/O
- * writes - the direct I/O write could have extended
- * the size nontransactionally, then finished before
- * we started. xfs_write_file will think that the file
- * didn't grow but the update isn't safe unless the
- * size change is logged.
- *
- * Force the log if we've committed a transaction
- * against the inode or if someone else has and
- * the commit record hasn't gone to disk (e.g.
- * the inode is pinned). This guarantees that
- * all changes affecting the inode are permanent
- * when we return.
- */
- if (iip && iip->ili_last_lsn) {
- error = _xfs_log_force(mp, iip->ili_last_lsn,
- XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
- } else if (xfs_ipincount(ip) > 0) {
- error = _xfs_log_force(mp, (xfs_lsn_t)0,
- XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
- }
-
- } else {
- xfs_trans_t *tp;
-
- /*
- * O_SYNC or O_DSYNC _with_ a size update are handled
- * the same way.
- *
- * If the write was synchronous then we need to make
- * sure that the inode modification time is permanent.
- * We'll have updated the timestamp above, so here
- * we use a synchronous transaction to log the inode.
- * It's not fast, but it's necessary.
- *
- * If this a dsync write and the size got changed
- * non-transactionally, then we need to ensure that
- * the size change gets logged in a synchronous
- * transaction.
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_SWRITE_LOG_RES(mp),
- 0, 0, 0))) {
- /* Transaction reserve failed */
- xfs_trans_cancel(tp, 0);
- } else {
- /* Transaction reserve successful */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- }
-
- return error;
-}
-
-/*
* Force a shutdown of the filesystem instantly while keeping
* the filesystem consistent. We don't do an unmount here; just shutdown
* the shop, make sure that absolutely nothing persistent happens to
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
* Prototypes for functions in xfs_rw.c.
*/
extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
extern int xfs_bioerror(struct xfs_buf *bp);
extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -78,10 +77,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
xfs_buf_t *bp, xfs_daddr_t blkno);
-/*
- * Prototypes for functions in xfs_vnodeops.c.
- */
-extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
- int flags);
-
#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_GROWFS 14
#define XFS_TRANS_STRAT_WRITE 15
#define XFS_TRANS_DIOSTRAT 16
-#define XFS_TRANS_WRITE_SYNC 17
+/* 17 was XFS_TRANS_WRITE_SYNC */
#define XFS_TRANS_WRITEID 18
#define XFS_TRANS_ADDAFORK 19
#define XFS_TRANS_ATTRINVAL 20
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
return (flags & XFS_BUF_TRYLOCK) ?
EAGAIN : XFS_ERROR(ENOMEM);
- if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+ if (XFS_BUF_GETERROR(bp) != 0) {
xfs_ioerror_alert("xfs_trans_read_buf", mp,
bp, blkno);
error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
return error;
}
#ifdef DEBUG
- if (xfs_do_error && (bp != NULL)) {
+ if (xfs_do_error) {
if (xfs_error_target == target) {
if (((xfs_req_num++) % xfs_error_mod) == 0) {
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(
/*
- * Get and lock the inode for the caller if it is not already
- * locked within the given transaction. If it is already locked
- * within the transaction, just increment its lock recursion count
- * and return a pointer to it.
- *
- * For an inode to be locked in a transaction, the inode lock, as
- * opposed to the io lock, must be taken exclusively. This ensures
- * that the inode can be involved in only 1 transaction at a time.
- * Lock recursion is handled on the io lock, but only for lock modes
- * of equal or lesser strength. That is, you can recur on the io lock
- * held EXCL with a SHARED request but not vice versa. Also, if
- * the inode is already a part of the transaction then you cannot
- * go from not holding the io lock to having it EXCL or SHARED.
- *
- * Use the inode cache routine xfs_inode_incore() to find the inode
- * if it is already owned by this transaction.
- *
- * If we don't already own the inode, use xfs_iget() to get it.
- * Since the inode log item structure is embedded in the incore
- * inode structure and is initialized when the inode is brought
- * into memory, there is nothing to do with it here.
- *
- * If the given transaction pointer is NULL, just call xfs_iget().
- * This simplifies code which must handle both cases.
+ * Get an inode and join it to the transaction.
*/
int
xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
xfs_inode_t **ipp)
{
int error;
- xfs_inode_t *ip;
-
- /*
- * If the transaction pointer is NULL, just call the normal
- * xfs_iget().
- */
- if (tp == NULL)
- return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
-
- /*
- * If we find the inode in core with this transaction
- * pointer in its i_transp field, then we know we already
- * have it locked. In this case we just increment the lock
- * recursion count and return the inode to the caller.
- * Assert that the inode is already locked in the mode requested
- * by the caller. We cannot do lock promotions yet, so
- * die if someone gets this wrong.
- */
- if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
- /*
- * Make sure that the inode lock is held EXCL and
- * that the io lock is never upgraded when the inode
- * is already a part of the transaction.
- */
- ASSERT(ip->i_itemp != NULL);
- ASSERT(lock_flags & XFS_ILOCK_EXCL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
- xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
- (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
- ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
- xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
- ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
- (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
-
- if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
- ip->i_itemp->ili_iolock_recur++;
- }
- if (lock_flags & XFS_ILOCK_EXCL) {
- ip->i_itemp->ili_ilock_recur++;
- }
- *ipp = ip;
- return 0;
- }
-
- ASSERT(lock_flags & XFS_ILOCK_EXCL);
- error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
- if (error) {
- return error;
- }
- ASSERT(ip != NULL);
- xfs_trans_ijoin(tp, ip, lock_flags);
- *ipp = ip;
- return 0;
+ error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+ if (!error && tp)
+ xfs_trans_ijoin(tp, *ipp, lock_flags);
+ return error;
}
/*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
ASSERT(iip->ili_flags == 0);
- ASSERT(iip->ili_ilock_recur == 0);
- ASSERT(iip->ili_iolock_recur == 0);
/*
* Get a log_item_desc to point at the new item.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 492d75bae2bf..a434f287962d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -611,7 +611,7 @@ xfs_fsync(
xfs_inode_t *ip)
{
xfs_trans_t *tp;
- int error;
+ int error = 0;
int log_flushed = 0, changed = 1;
xfs_itrace_entry(ip);
@@ -619,14 +619,9 @@ xfs_fsync(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return XFS_ERROR(EIO);
- /* capture size updates in I/O completion before writing the inode. */
- error = xfs_wait_on_pages(ip, 0, -1);
- if (error)
- return XFS_ERROR(error);
-
/*
* We always need to make sure that the required inode state is safe on
- * disk. The vnode might be clean but we still might need to force the
+ * disk. The inode might be clean but we still might need to force the
* log because of committed transactions that haven't hit the disk yet.
* Likewise, there could be unflushed non-transactional changes to the
* inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@ xfs_fsync(
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (!(ip->i_update_size || ip->i_update_core)) {
+ if (!ip->i_update_core) {
/*
* Timestamps/size haven't changed since last inode flush or
* inode transaction commit. That means either nothing got
@@ -718,7 +713,7 @@ xfs_fsync(
* when the link count isn't zero and by xfs_dm_punch_hole() when
* punching a hole to EOF.
*/
-int
+STATIC int
xfs_free_eofblocks(
xfs_mount_t *mp,
xfs_inode_t *ip,
@@ -1476,8 +1471,8 @@ xfs_create(
if (error == ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(dp);
- error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, resblks, log_res, 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
}
if (error == ENOSPC) {
/* No space at all so try a "no-allocation" reservation */
OpenPOWER on IntegriCloud