diff options
Diffstat (limited to 'fs')
172 files changed, 4421 insertions, 1458 deletions
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index b573c3b9a328..8cf941c3b511 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -450,7 +450,7 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) pr_warn("Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; va_end(args); diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 675148950fed..2b2112475ec2 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -19,7 +19,7 @@ affs_count_free_blocks(struct super_block *sb) pr_debug("%s()\n", __func__); - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 0; mutex_lock(&AFFS_SB(sb)->s_bmlock); diff --git a/fs/affs/super.c b/fs/affs/super.c index 7bf47a41cb4f..884bedab7266 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -80,7 +80,7 @@ void affs_mark_sb_dirty(struct super_block *sb) struct affs_sb_info *sbi = AFFS_SB(sb); unsigned long delay; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; spin_lock(&sbi->work_lock); @@ -464,7 +464,7 @@ got_root: * not recommended. */ if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS - || chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) { + || chksum == MUFS_DCOFS) && !sb_rdonly(sb)) { pr_notice("Dircache FS - mounting %s read only\n", sb->s_id); sb->s_flags |= MS_RDONLY; } @@ -596,7 +596,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) memcpy(sbi->s_volume, volume, 32); spin_unlock(&sbi->symlink_lock); - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) return 0; if (*flags & MS_RDONLY) @@ -1606,12 +1606,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; } - if ((req->common.ki_flags & IOCB_NOWAIT) && - !(req->common.ki_flags & IOCB_DIRECT)) { - ret = -EOPNOTSUPP; - goto out_put_req; - } - ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 24a58bf9ca72..4ac49d038bf3 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -56,19 +56,14 @@ static int autofs4_write(struct autofs_sb_info *sbi, struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; - mm_segment_t fs; const char *data = (const char *)addr; ssize_t wr = 0; sigpipe = sigismember(¤t->pending.signal, SIGPIPE); - /* Save pointer to user space and point back to kernel space */ - fs = get_fs(); - set_fs(KERNEL_DS); - mutex_lock(&sbi->pipe_mutex); while (bytes) { - wr = __vfs_write(file, data, bytes, &file->f_pos); + wr = __kernel_write(file, data, bytes, &file->f_pos); if (wr <= 0) break; data += wr; @@ -76,8 +71,6 @@ static int autofs4_write(struct autofs_sb_info *sbi, } mutex_unlock(&sbi->pipe_mutex); - set_fs(fs); - /* Keep the currently executing process from receiving a * SIGPIPE unless it was already supposed to get one */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 4a4a5a366158..a92355cc453b 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -838,7 +838,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) befs_debug(sb, "---> %s", __func__); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { befs_warning(sb, "No write support. Marking filesystem read-only"); sb->s_flags |= MS_RDONLY; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 9be82c4e14a4..ce1824f47ba6 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -341,11 +341,12 @@ static int load_aout_library(struct file *file) unsigned long error; int retval; struct exec ex; + loff_t pos = 0; inode = file_inode(file); retval = -ENOEXEC; - error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); + error = kernel_read(file, &ex, sizeof(ex), &pos); if (error != sizeof(ex)) goto out; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ec45d24875b1..73b01e474fdc 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -409,6 +409,7 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, { struct elf_phdr *elf_phdata = NULL; int retval, size, err = -1; + loff_t pos = elf_ex->e_phoff; /* * If the size of this structure has changed, then punt, since @@ -432,8 +433,7 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, goto out; /* Read in the program headers */ - retval = kernel_read(elf_file, elf_ex->e_phoff, - (char *)elf_phdata, size); + retval = kernel_read(elf_file, elf_phdata, size, &pos); if (retval != size) { err = (retval < 0) ? retval : -EIO; goto out; @@ -698,6 +698,7 @@ static int load_elf_binary(struct linux_binprm *bprm) struct elfhdr interp_elf_ex; } *loc; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; + loff_t pos; loc = kmalloc(sizeof(*loc), GFP_KERNEL); if (!loc) { @@ -750,9 +751,9 @@ static int load_elf_binary(struct linux_binprm *bprm) if (!elf_interpreter) goto out_free_ph; - retval = kernel_read(bprm->file, elf_ppnt->p_offset, - elf_interpreter, - elf_ppnt->p_filesz); + pos = elf_ppnt->p_offset; + retval = kernel_read(bprm->file, elf_interpreter, + elf_ppnt->p_filesz, &pos); if (retval != elf_ppnt->p_filesz) { if (retval >= 0) retval = -EIO; @@ -776,9 +777,9 @@ static int load_elf_binary(struct linux_binprm *bprm) would_dump(bprm, interpreter); /* Get the exec headers */ - retval = kernel_read(interpreter, 0, - (void *)&loc->interp_elf_ex, - sizeof(loc->interp_elf_ex)); + pos = 0; + retval = kernel_read(interpreter, &loc->interp_elf_ex, + sizeof(loc->interp_elf_ex), &pos); if (retval != sizeof(loc->interp_elf_ex)) { if (retval >= 0) retval = -EIO; @@ -1175,9 +1176,10 @@ static int load_elf_library(struct file *file) unsigned long elf_bss, bss, len; int retval, error, i, j; struct elfhdr elf_ex; + loff_t pos = 0; error = -ENOEXEC; - retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex)); + retval = kernel_read(file, &elf_ex, sizeof(elf_ex), &pos); if (retval != sizeof(elf_ex)) goto out; @@ -1201,7 +1203,8 @@ static int load_elf_library(struct file *file) eppnt = elf_phdata; error = -ENOEXEC; - retval = kernel_read(file, elf_ex.e_phoff, (char *)eppnt, j); + pos = elf_ex.e_phoff; + retval = kernel_read(file, eppnt, j, &pos); if (retval != j) goto out_free_ph; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 5aa9199dfb13..e70c039ac190 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -145,6 +145,7 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct elf32_phdr *phdr; unsigned long size; int retval, loop; + loff_t pos = params->hdr.e_phoff; if (params->hdr.e_phentsize != sizeof(struct elf_phdr)) return -ENOMEM; @@ -156,8 +157,7 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, if (!params->phdrs) return -ENOMEM; - retval = kernel_read(file, params->hdr.e_phoff, - (char *) params->phdrs, size); + retval = kernel_read(file, params->phdrs, size, &pos); if (unlikely(retval != size)) return retval < 0 ? retval : -ENOEXEC; @@ -199,6 +199,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) char *interpreter_name = NULL; int executable_stack; int retval, i; + loff_t pos; kdebug("____ LOAD %d ____", current->pid); @@ -246,10 +247,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) if (!interpreter_name) goto error; - retval = kernel_read(bprm->file, - phdr->p_offset, - interpreter_name, - phdr->p_filesz); + pos = phdr->p_offset; + retval = kernel_read(bprm->file, interpreter_name, + phdr->p_filesz, &pos); if (unlikely(retval != phdr->p_filesz)) { if (retval >= 0) retval = -ENOEXEC; @@ -277,8 +277,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) */ would_dump(bprm, interpreter); - retval = kernel_read(interpreter, 0, bprm->buf, - BINPRM_BUF_SIZE); + pos = 0; + retval = kernel_read(interpreter, bprm->buf, + BINPRM_BUF_SIZE, &pos); if (unlikely(retval != BINPRM_BUF_SIZE)) { if (retval >= 0) retval = -ENOEXEC; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index ce6537c50ec1..475d083f8088 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -176,19 +176,14 @@ static int create_flat_tables(struct linux_binprm *bprm, unsigned long arg_start #define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ #define RESERVED 0xC0 /* bit 6,7: reserved */ -static int decompress_exec( - struct linux_binprm *bprm, - unsigned long offset, - char *dst, - long len, - int fd) +static int decompress_exec(struct linux_binprm *bprm, loff_t fpos, char *dst, + long len, int fd) { unsigned char *buf; z_stream strm; - loff_t fpos; int ret, retval; - pr_debug("decompress_exec(offset=%lx,buf=%p,len=%lx)\n", offset, dst, len); + pr_debug("decompress_exec(offset=%llx,buf=%p,len=%lx)\n", fpos, dst, len); memset(&strm, 0, sizeof(strm)); strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); @@ -202,13 +197,11 @@ static int decompress_exec( } /* Read in first chunk of data and parse gzip header. */ - fpos = offset; - ret = kernel_read(bprm->file, offset, buf, LBUFSIZE); + ret = kernel_read(bprm->file, buf, LBUFSIZE, &fpos); strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; - fpos += ret; retval = -ENOEXEC; @@ -274,7 +267,7 @@ static int decompress_exec( } while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { - ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE); + ret = kernel_read(bprm->file, buf, LBUFSIZE, &fpos); if (ret <= 0) break; len -= ret; @@ -282,7 +275,6 @@ static int decompress_exec( strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; - fpos += ret; } if (ret < 0) { diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index f4718098ac31..ce7181ea60fa 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -218,12 +218,15 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->file = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) { + loff_t pos = 0; + /* * No need to call prepare_binprm(), it's already been * done. bprm->buf is stale, update from interp_file. */ memset(bprm->buf, 0, BINPRM_BUF_SIZE); - retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); + retval = kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, + &pos); } else retval = prepare_binprm(bprm); diff --git a/fs/block_dev.c b/fs/block_dev.c index bb715b2fcfb8..93d088ffc05c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1740,6 +1740,8 @@ static int blkdev_open(struct inode * inode, struct file * filp) */ filp->f_flags |= O_LARGEFILE; + filp->f_mode |= FMODE_NOWAIT; + if (filp->f_flags & O_NDELAY) filp->f_mode |= FMODE_NDELAY; if (filp->f_flags & O_EXCL) @@ -1892,6 +1894,9 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_pos >= size) return -ENOSPC; + if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) + return -EOPNOTSUPP; + iov_iter_truncate(from, size - iocb->ki_pos); blk_start_plug(&plug); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 80e9c18ea64f..a26c63b4ad68 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -6,6 +6,8 @@ config BTRFS_FS select ZLIB_DEFLATE select LZO_COMPRESS select LZO_DECOMPRESS + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 128ce17a80b0..962a95aefb81 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o free-space-cache.o zlib.o lzo.o \ + export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o hash.o free-space-tree.o diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 883ecc58fd0d..b51d23f5cafa 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -704,6 +704,7 @@ static struct { static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, + &btrfs_zstd_compress, }; void __init btrfs_init_compress(void) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 3b1b0ac15fdc..d2781ff8f994 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -99,7 +99,8 @@ enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LZO = 2, - BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_ZSTD = 3, + BTRFS_COMPRESS_TYPES = 3, }; struct btrfs_compress_op { @@ -127,6 +128,7 @@ struct btrfs_compress_op { extern const struct btrfs_compress_op btrfs_zlib_compress; extern const struct btrfs_compress_op btrfs_lzo_compress; +extern const struct btrfs_compress_op btrfs_zstd_compress; int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2add002662f4..5a8933da39a7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -270,6 +270,7 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7a93a3e1a847..7c655f9a7a50 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -704,7 +704,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) u64 result; int ret; - if (fs_info->sb->s_flags & MS_RDONLY) + if (sb_rdonly(fs_info->sb)) return -EROFS; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 46329524dd5f..487bbe4fb3c6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2478,7 +2478,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return ret; } - if (fs_info->sb->s_flags & MS_RDONLY) { + if (sb_rdonly(fs_info->sb)) { ret = btrfs_commit_super(fs_info); if (ret) return ret; @@ -2828,6 +2828,8 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); @@ -2874,7 +2876,7 @@ int open_ctree(struct super_block *sb, features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; - if (!(sb->s_flags & MS_RDONLY) && features) { + if (!sb_rdonly(sb) && features) { btrfs_err(fs_info, "cannot mount read-write because of unsupported optional features (%llx)", features); @@ -3036,7 +3038,7 @@ retry_root_backup: goto fail_sysfs; } - if (!(sb->s_flags & MS_RDONLY) && !btrfs_check_rw_degradable(fs_info)) { + if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info)) { btrfs_warn(fs_info, "writeable mount is not allowed due to too many missing devices"); goto fail_sysfs; @@ -3095,7 +3097,7 @@ retry_root_backup: if (ret) goto fail_qgroup; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto fail_qgroup; @@ -3121,7 +3123,7 @@ retry_root_backup: goto fail_qgroup; } - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 0; if (btrfs_test_opt(fs_info, CLEAR_CACHE) && @@ -3876,7 +3878,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); - if (!(fs_info->sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(fs_info->sb)) { /* * If the cleaner thread is stopped and there are * block groups queued for removal, the deletion will be diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0f077c5db58e..3e5bb0cdd3cd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2060,7 +2060,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); int ret = 0; - if (fs_info->sb->s_flags & MS_RDONLY) + if (sb_rdonly(fs_info->sb)) return -EROFS; for (i = 0; i < num_pages; i++) { @@ -2110,7 +2110,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, failrec->start); goto out; } - if (fs_info->sb->s_flags & MS_RDONLY) + if (sb_rdonly(fs_info->sb)) goto out; spin_lock(&io_tree->lock); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 74fd7756cff3..aafcc785f840 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1886,6 +1886,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t oldsize; int clean_page = 0; + if (!(iocb->ki_flags & IOCB_DIRECT) && + (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; @@ -3112,7 +3116,7 @@ out: static int btrfs_file_open(struct inode *inode, struct file *filp) { - filp->f_mode |= FMODE_AIO_NOWAIT; + filp->f_mode |= FMODE_NOWAIT; return generic_file_open(inode, filp); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17ad018da0a2..128f3e58634f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5821,7 +5821,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (!IS_ERR(inode) && root != sub_root) { down_read(&fs_info->cleanup_work_sem); - if (!(inode->i_sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ae8fbf9d3de2..d6715c2bcdc4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -296,8 +296,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (fs_info->compress_type == BTRFS_COMPRESS_LZO) comp = "lzo"; - else + else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB) comp = "zlib"; + else + comp = "zstd"; ret = btrfs_set_prop(inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) @@ -1435,6 +1437,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (range->compress_type == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } ret = defrag_count; @@ -4422,7 +4426,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, switch (p->cmd) { case BTRFS_IOCTL_DEV_REPLACE_CMD_START: - if (fs_info->sb->s_flags & MS_RDONLY) { + if (sb_rdonly(fs_info->sb)) { ret = -EROFS; goto out; } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 09c0266f248d..f6a05f836629 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -390,6 +390,8 @@ static int prop_compression_validate(const char *value, size_t len) return 0; else if (!strncmp("zlib", value, len)) return 0; + else if (!strncmp("zstd", value, len)) + return 0; return -EINVAL; } @@ -412,6 +414,8 @@ static int prop_compression_apply(struct inode *inode, type = BTRFS_COMPRESS_LZO; else if (!strncmp("zlib", value, 4)) type = BTRFS_COMPRESS_ZLIB; + else if (!strncmp("zstd", value, len)) + type = BTRFS_COMPRESS_ZSTD; else return -EINVAL; @@ -429,6 +433,8 @@ static const char *prop_compression_extract(struct inode *inode) return "zlib"; case BTRFS_COMPRESS_LZO: return "lzo"; + case BTRFS_COMPRESS_ZSTD: + return "zstd"; } return NULL; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9fb9896610e0..95bcc3cce78f 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -228,7 +228,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) int ret; bool can_recover = true; - if (fs_info->sb->s_flags & MS_RDONLY) + if (sb_rdonly(fs_info->sb)) can_recover = false; path = btrfs_alloc_path(); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8f1d3d6e7087..32b043ef8ac9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -539,33 +539,23 @@ static struct btrfs_path *alloc_path_for_send(void) static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) { int ret; - mm_segment_t old_fs; u32 pos = 0; - old_fs = get_fs(); - set_fs(KERNEL_DS); - while (pos < len) { - ret = vfs_write(filp, (__force const char __user *)buf + pos, - len - pos, off); + ret = kernel_write(filp, buf + pos, len - pos, off); /* TODO handle that correctly */ /*if (ret == -ERESTARTSYS) { continue; }*/ if (ret < 0) - goto out; + return ret; if (ret == 0) { - ret = -EIO; - goto out; + return -EIO; } pos += ret; } - ret = 0; - -out: - set_fs(old_fs); - return ret; + return 0; } static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0b7a1d8cd08b..35a128acfbd1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -103,7 +103,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) { struct super_block *sb = fs_info->sb; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { @@ -139,7 +139,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Special case: if the error is EROFS, and we're already * under MS_RDONLY, then it is safe here. */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + if (errno == -EROFS && sb_rdonly(sb)) return; #ifdef CONFIG_PRINTK @@ -514,6 +514,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); no_compress = 0; + } else if (strcmp(args[0].from, "zstd") == 0) { + compress_type = "zstd"; + info->compress_type = BTRFS_COMPRESS_ZSTD; + btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + btrfs_set_fs_incompat(info, COMPRESS_ZSTD); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); @@ -1230,8 +1238,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, COMPRESS)) { if (info->compress_type == BTRFS_COMPRESS_ZLIB) compress_type = "zlib"; - else + else if (info->compress_type == BTRFS_COMPRESS_LZO) compress_type = "lzo"; + else + compress_type = "zstd"; if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else @@ -1691,8 +1701,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && - (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || - (fs_info->sb->s_flags & MS_RDONLY))) { + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) { btrfs_cleanup_defrag_inodes(fs_info); } @@ -1739,7 +1748,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) goto out; if (*flags & MS_RDONLY) { @@ -1840,7 +1849,7 @@ out: restore: /* We've hit an error - don't reset MS_RDONLY */ - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) old_flags |= MS_RDONLY; sb->s_flags = old_flags; fs_info->mount_opt = old_opts; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c2d5f3580b4c..883881b16c86 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -120,7 +120,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, if (!fs_info) return -EPERM; - if (fs_info->sb->s_flags & MS_RDONLY) + if (sb_rdonly(fs_info->sb)) return -EROFS; ret = kstrtoul(skip_spaces(buf), 0, &val); @@ -200,6 +200,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); @@ -212,6 +213,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(compress_zstd), BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), @@ -388,7 +390,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, if (!fs_info) return -EPERM; - if (fs_info->sb->s_flags & MS_RDONLY) + if (sb_rdonly(fs_info->sb)) return -EROFS; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c188256a367c..0e8f16c305df 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2324,7 +2324,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path int seeding_dev = 0; int ret = 0; - if ((sb->s_flags & MS_RDONLY) && !fs_info->fs_devices->seeding) + if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) return -EROFS; bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, @@ -4053,7 +4053,7 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) { - if (fs_info->sb->s_flags & MS_RDONLY) + if (sb_rdonly(fs_info->sb)) return -EROFS; mutex_lock(&fs_info->balance_mutex); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c new file mode 100644 index 000000000000..607ce47b483a --- /dev/null +++ b/fs/btrfs/zstd.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bio.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/refcount.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/zstd.h> +#include "compression.h" + +#define ZSTD_BTRFS_MAX_WINDOWLOG 17 +#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) +#define ZSTD_BTRFS_DEFAULT_LEVEL 3 + +static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len) +{ + ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL, + src_len, 0); + + if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) + params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; + WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT); + return params; +} + +struct workspace { + void *mem; + size_t size; + char *buf; + struct list_head list; +}; + +static void zstd_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + kvfree(workspace->mem); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zstd_alloc_workspace(void) +{ + ZSTD_parameters params = + zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT); + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->size = max_t(size_t, + ZSTD_CStreamWorkspaceBound(params.cParams), + ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!workspace->mem || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zstd_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static int zstd_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, + struct page **pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + ZSTD_CStream *stream; + int ret = 0; + int nr_pages = 0; + struct page *in_page = NULL; /* The current page to read */ + struct page *out_page = NULL; /* The current page to write to */ + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long len = *total_out; + const unsigned long nr_dest_pages = *out_pages; + unsigned long max_out = nr_dest_pages * PAGE_SIZE; + ZSTD_parameters params = zstd_get_btrfs_parameters(len); + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + /* Initialize the stream */ + stream = ZSTD_initCStream(params, len, workspace->mem, + workspace->size); + if (!stream) { + pr_warn("BTRFS: ZSTD_initCStream failed\n"); + ret = -EIO; + goto out; + } + + /* map in the first page of input data */ + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + in_buf.src = kmap(in_page); + in_buf.pos = 0; + in_buf.size = min_t(size_t, len, PAGE_SIZE); + + + /* Allocate and map in the output buffer */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + + while (1) { + size_t ret2; + + ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_compressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto out; + } + + /* Check to see if we are making it bigger */ + if (tot_in + in_buf.pos > 8192 && + tot_in + in_buf.pos < + tot_out + out_buf.pos) { + ret = -E2BIG; + goto out; + } + + /* We've reached the end of our output range */ + if (out_buf.pos >= max_out) { + tot_out += out_buf.pos; + ret = -E2BIG; + goto out; + } + + /* Check if we need more output space */ + if (out_buf.pos == out_buf.size) { + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + } + + /* We've reached the end of the input */ + if (in_buf.pos >= len) { + tot_in += in_buf.pos; + break; + } + + /* Check if we need more input */ + if (in_buf.pos == in_buf.size) { + tot_in += PAGE_SIZE; + kunmap(in_page); + put_page(in_page); + + start += PAGE_SIZE; + len -= PAGE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + in_buf.src = kmap(in_page); + in_buf.pos = 0; + in_buf.size = min_t(size_t, len, PAGE_SIZE); + } + } + while (1) { + size_t ret2; + + ret2 = ZSTD_endStream(stream, &out_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_endStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto out; + } + if (ret2 == 0) { + tot_out += out_buf.pos; + break; + } + if (out_buf.pos >= max_out) { + tot_out += out_buf.pos; + ret = -E2BIG; + goto out; + } + + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + } + + if (tot_out >= tot_in) { + ret = -E2BIG; + goto out; + } + + ret = 0; + *total_in = tot_in; + *total_out = tot_out; +out: + *out_pages = nr_pages; + /* Cleanup */ + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + if (out_page) + kunmap(out_page); + return ret; +} + +static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; + size_t srclen = cb->compressed_len; + ZSTD_DStream *stream; + int ret = 0; + unsigned long page_in_index = 0; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long buf_start; + unsigned long total_out = 0; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + + stream = ZSTD_initDStream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_debug("BTRFS: ZSTD_initDStream failed\n"); + ret = -EIO; + goto done; + } + + in_buf.src = kmap(pages_in[page_in_index]); + in_buf.pos = 0; + in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + + out_buf.dst = workspace->buf; + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + + while (1) { + size_t ret2; + + ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto done; + } + buf_start = total_out; + total_out += out_buf.pos; + out_buf.pos = 0; + + ret = btrfs_decompress_buf2page(out_buf.dst, buf_start, + total_out, disk_start, orig_bio); + if (ret == 0) + break; + + if (in_buf.pos >= srclen) + break; + + /* Check if we've hit the end of a frame */ + if (ret2 == 0) + break; + + if (in_buf.pos == in_buf.size) { + kunmap(pages_in[page_in_index++]); + if (page_in_index >= total_pages_in) { + in_buf.src = NULL; + ret = -EIO; + goto done; + } + srclen -= PAGE_SIZE; + in_buf.src = kmap(pages_in[page_in_index]); + in_buf.pos = 0; + in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + } + } + ret = 0; + zero_fill_bio(orig_bio); +done: + if (in_buf.src) + kunmap(pages_in[page_in_index]); + return ret; +} + +static int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + ZSTD_DStream *stream; + int ret = 0; + size_t ret2; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + unsigned long total_out = 0; + unsigned long pg_offset = 0; + char *kaddr; + + stream = ZSTD_initDStream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_warn("BTRFS: ZSTD_initDStream failed\n"); + ret = -EIO; + goto finish; + } + + destlen = min_t(size_t, destlen, PAGE_SIZE); + + in_buf.src = data_in; + in_buf.pos = 0; + in_buf.size = srclen; + + out_buf.dst = workspace->buf; + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + + ret2 = 1; + while (pg_offset < destlen && in_buf.pos < in_buf.size) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + + /* Check if the frame is over and we still need more input */ + if (ret2 == 0) { + pr_debug("BTRFS: ZSTD_decompressStream ended early\n"); + ret = -EIO; + goto finish; + } + ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto finish; + } + + buf_start = total_out; + total_out += out_buf.pos; + out_buf.pos = 0; + + if (total_out <= start_byte) + continue; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min_t(unsigned long, destlen - pg_offset, + out_buf.size - buf_offset); + + kaddr = kmap_atomic(dest_page); + memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes); + kunmap_atomic(kaddr); + + pg_offset += bytes; + } + ret = 0; +finish: + if (pg_offset < destlen) { + kaddr = kmap_atomic(dest_page); + memset(kaddr + pg_offset, 0, destlen - pg_offset); + kunmap_atomic(kaddr); + } + return ret; +} + +const struct btrfs_compress_op btrfs_zstd_compress = { + .alloc_workspace = zstd_alloc_workspace, + .free_workspace = zstd_free_workspace, + .compress_pages = zstd_compress_pages, + .decompress_bio = zstd_decompress_bio, + .decompress = zstd_decompress, +}; diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 3ff867f87d73..d9f001078e08 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -133,7 +133,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) goto error_unsupported; ret = -EROFS; - if (root->d_sb->s_flags & MS_RDONLY) + if (sb_rdonly(root->d_sb)) goto error_unsupported; /* determine the security of the on-disk cache as this governs diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1bc709fe330a..b3e3edc09d80 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -152,17 +152,10 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, ceph_invalidate_fscache_page(inode, page); + WARN_ON(!PageLocked(page)); if (!PagePrivate(page)) return; - /* - * We can get non-dirty pages here due to races between - * set_page_dirty and truncate_complete_page; just spit out a - * warning, in case we end up with accounting problems later. - */ - if (!PageDirty(page)) - pr_err("%p invalidatepage %p page not dirty\n", inode, page); - ClearPageChecked(page); dout("%p invalidatepage %p idx %lu full dirty page\n", @@ -455,13 +448,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc == 0) goto out; - if (fsc->mount_options->rsize >= PAGE_SIZE) - max = (fsc->mount_options->rsize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - - dout("readpages %p file %p nr_pages %d max %d\n", inode, - file, nr_pages, - max); + max = fsc->mount_options->rsize >> PAGE_SHIFT; + dout("readpages %p file %p nr_pages %d max %d\n", + inode, file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); if (rc < 0) @@ -474,14 +463,22 @@ out: return rc; } +struct ceph_writeback_ctl +{ + loff_t i_size; + u64 truncate_size; + u32 truncate_seq; + bool size_stable; + bool head_snapc; +}; + /* * Get ref for the oldest snapc for an inode with dirty data... that is, the * only snap context we are allowed to write back. */ -static struct ceph_snap_context *get_oldest_context(struct inode *inode, - loff_t *snap_size, - u64 *truncate_size, - u32 *truncate_seq) +static struct ceph_snap_context * +get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, + struct ceph_snap_context *page_snapc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -491,30 +488,78 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); - if (capsnap->dirty_pages) { - snapc = ceph_get_snap_context(capsnap->context); - if (snap_size) - *snap_size = capsnap->size; - if (truncate_size) - *truncate_size = capsnap->truncate_size; - if (truncate_seq) - *truncate_seq = capsnap->truncate_seq; - break; + if (!capsnap->dirty_pages) + continue; + + /* get i_size, truncate_{seq,size} for page_snapc? */ + if (snapc && capsnap->context != page_snapc) + continue; + + if (ctl) { + if (capsnap->writing) { + ctl->i_size = i_size_read(inode); + ctl->size_stable = false; + } else { + ctl->i_size = capsnap->size; + ctl->size_stable = true; + } + ctl->truncate_size = capsnap->truncate_size; + ctl->truncate_seq = capsnap->truncate_seq; + ctl->head_snapc = false; } + + if (snapc) + break; + + snapc = ceph_get_snap_context(capsnap->context); + if (!page_snapc || + page_snapc == snapc || + page_snapc->seq > snapc->seq) + break; } if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); - if (truncate_size) - *truncate_size = ci->i_truncate_size; - if (truncate_seq) - *truncate_seq = ci->i_truncate_seq; + if (ctl) { + ctl->i_size = i_size_read(inode); + ctl->truncate_size = ci->i_truncate_size; + ctl->truncate_seq = ci->i_truncate_seq; + ctl->size_stable = false; + ctl->head_snapc = true; + } } spin_unlock(&ci->i_ceph_lock); return snapc; } +static u64 get_writepages_data_length(struct inode *inode, + struct page *page, u64 start) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_cap_snap *capsnap = NULL; + u64 end = i_size_read(inode); + + if (snapc != ci->i_head_snapc) { + bool found = false; + spin_lock(&ci->i_ceph_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + if (!capsnap->writing) + end = capsnap->size; + found = true; + break; + } + } + spin_unlock(&ci->i_ceph_lock); + WARN_ON(!found); + } + if (end > page_offset(page) + PAGE_SIZE) + end = page_offset(page) + PAGE_SIZE; + return end > start ? end - start : 0; +} + /* * Write a single page, but leave the page locked. * @@ -526,30 +571,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct inode *inode; struct ceph_inode_info *ci; struct ceph_fs_client *fsc; - struct ceph_osd_client *osdc; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); - loff_t snap_size = -1; long writeback_stat; - u64 truncate_size; - u32 truncate_seq; int err, len = PAGE_SIZE; + struct ceph_writeback_ctl ceph_wbc; dout("writepage %p idx %lu\n", page, page->index); inode = page->mapping->host; ci = ceph_inode(inode); fsc = ceph_inode_to_client(inode); - osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ snapc = page_snap_context(page); - if (snapc == NULL) { + if (!snapc) { dout("writepage %p page %p not dirty?\n", inode, page); return 0; } - oldest = get_oldest_context(inode, &snap_size, - &truncate_size, &truncate_seq); + oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, snapc); @@ -561,20 +601,18 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); - if (snap_size == -1) - snap_size = i_size_read(inode); - /* is this a partial page at end of file? */ - if (page_off >= snap_size) { - dout("%p page eof %llu\n", page, snap_size); + if (page_off >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", page, ceph_wbc.i_size); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); return 0; } - if (snap_size < page_off + len) - len = snap_size - page_off; + if (ceph_wbc.i_size < page_off + len) + len = ceph_wbc.i_size - page_off; - dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", - inode, page, page->index, page_off, len, snapc); + dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", + inode, page, page->index, page_off, len, snapc, snapc->seq); writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > @@ -582,10 +620,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); - err = ceph_osdc_writepages(osdc, ceph_vino(inode), - &ci->i_layout, snapc, - page_off, len, - truncate_seq, truncate_size, + err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, snapc, page_off, len, + ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { struct writeback_control tmp_wbc; @@ -746,31 +784,17 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_vino vino = ceph_vino(inode); - pgoff_t index, start, end; - int range_whole = 0; - int should_loop = 1; - pgoff_t max_pages = 0, max_pages_ever = 0; + pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; struct pagevec pvec; - int done = 0; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; - int do_sync = 0; - loff_t snap_size, i_size; - u64 truncate_size; - u32 truncate_seq; + struct ceph_writeback_ctl ceph_wbc; + bool should_loop, range_whole = false; + bool stop, done = false; - /* - * Include a 'sync' in the OSD request if this is a data - * integrity write (e.g., O_SYNC write or fsync()), or if our - * cap is being revoked. - */ - if ((wbc->sync_mode == WB_SYNC_ALL) || - ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) - do_sync = 1; - dout("writepages_start %p dosync=%d (mode=%s)\n", - inode, do_sync, + dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); @@ -783,35 +807,17 @@ static int ceph_writepages_start(struct address_space *mapping, mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } - if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - if (wsize < PAGE_SIZE) - wsize = PAGE_SIZE; - max_pages_ever = wsize >> PAGE_SHIFT; pagevec_init(&pvec, 0); - /* where to start/end? */ - if (wbc->range_cyclic) { - start = mapping->writeback_index; /* Start from prev offset */ - end = -1; - dout(" cyclic, start at %lu\n", start); - } else { - start = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - should_loop = 0; - dout(" not cyclic, %lu to %lu\n", start, end); - } - index = start; + start_index = wbc->range_cyclic ? mapping->writeback_index : 0; + index = start_index; retry: /* find oldest snap context with dirty data */ - ceph_put_snap_context(snapc); - snap_size = -1; - snapc = get_oldest_context(inode, &snap_size, - &truncate_size, &truncate_seq); + snapc = get_oldest_context(inode, &ceph_wbc, NULL); if (!snapc) { /* hmm, why does writepages get called when there is no dirty data? */ @@ -821,40 +827,56 @@ retry: dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); - i_size = i_size_read(inode); - - if (last_snapc && snapc != last_snapc) { - /* if we switched to a newer snapc, restart our scan at the - * start of the original file range. */ - dout(" snapc differs from last pass, restarting at %lu\n", - index); - index = start; + should_loop = false; + if (ceph_wbc.head_snapc && snapc != last_snapc) { + /* where to start/end? */ + if (wbc->range_cyclic) { + index = start_index; + end = -1; + if (index > 0) + should_loop = true; + dout(" cyclic, start at %lu\n", index); + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = true; + dout(" not cyclic, %lu to %lu\n", index, end); + } + } else if (!ceph_wbc.head_snapc) { + /* Do not respect wbc->range_{start,end}. Dirty pages + * in that range can be associated with newer snapc. + * They are not writeable until we write all dirty pages + * associated with 'snapc' get written */ + if (index > 0 || wbc->sync_mode != WB_SYNC_NONE) + should_loop = true; + dout(" non-head snapc, range whole\n"); } + + ceph_put_snap_context(last_snapc); last_snapc = snapc; - while (!done && index <= end) { - unsigned i; - int first; - pgoff_t strip_unit_end = 0; + stop = false; + while (!stop && index <= end) { int num_ops = 0, op_idx; - int pvec_pages, locked_pages = 0; + unsigned i, pvec_pages, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; - int want; + pgoff_t strip_unit_end = 0; u64 offset = 0, len = 0; - max_pages = max_pages_ever; + max_pages = wsize >> PAGE_SHIFT; get_more_pages: - first = -1; - want = min(end - index, - min((pgoff_t)PAGEVEC_SIZE, - max_pages - (pgoff_t)locked_pages) - 1) - + 1; + pvec_pages = min_t(unsigned, PAGEVEC_SIZE, + max_pages - locked_pages); + if (end - index < (u64)(pvec_pages - 1)) + pvec_pages = (unsigned)(end - index) + 1; + pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - want); + pvec_pages); dout("pagevec_lookup_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; @@ -871,11 +893,15 @@ get_more_pages: unlikely(page->mapping != mapping)) { dout("!dirty or !mapping %p\n", page); unlock_page(page); - break; + continue; } - if (!wbc->range_cyclic && page->index > end) { + if (page->index > end) { dout("end of range %p\n", page); - done = 1; + /* can't be range_cyclic (1st pass) because + * end == -1 in that case. */ + stop = true; + if (ceph_wbc.head_snapc) + done = true; unlock_page(page); break; } @@ -884,39 +910,37 @@ get_more_pages: unlock_page(page); break; } - if (wbc->sync_mode != WB_SYNC_NONE) { - dout("waiting on writeback %p\n", page); - wait_on_page_writeback(page); - } - if (page_offset(page) >= - (snap_size == -1 ? i_size : snap_size)) { - dout("%p page eof %llu\n", page, - (snap_size == -1 ? i_size : snap_size)); - done = 1; + if (page_offset(page) >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", + page, ceph_wbc.i_size); + /* not done if range_cyclic */ + stop = true; unlock_page(page); break; } if (PageWriteback(page)) { - dout("%p under writeback\n", page); - unlock_page(page); - break; + if (wbc->sync_mode == WB_SYNC_NONE) { + dout("%p under writeback\n", page); + unlock_page(page); + continue; + } + dout("waiting on writeback %p\n", page); + wait_on_page_writeback(page); } /* only if matching snap context */ pgsnapc = page_snap_context(page); - if (pgsnapc->seq > snapc->seq) { - dout("page snapc %p %lld > oldest %p %lld\n", + if (pgsnapc != snapc) { + dout("page snapc %p %lld != oldest %p %lld\n", pgsnapc, pgsnapc->seq, snapc, snapc->seq); unlock_page(page); - if (!locked_pages) - continue; /* keep looking for snap */ - break; + continue; } if (!clear_page_dirty_for_io(page)) { dout("%p !clear_page_dirty_for_io\n", page); unlock_page(page); - break; + continue; } /* @@ -942,7 +966,7 @@ get_more_pages: break; } - num_ops = 1 + do_sync; + num_ops = 1; strip_unit_end = page->index + ((len - 1) >> PAGE_SHIFT); @@ -972,8 +996,6 @@ get_more_pages: } /* note position of first page in pvec */ - if (first < 0) - first = i; dout("%p will write page %p idx %lu\n", inode, page, page->index); @@ -984,8 +1006,10 @@ get_more_pages: BLK_RW_ASYNC); } - pages[locked_pages] = page; - locked_pages++; + + pages[locked_pages++] = page; + pvec.pages[i] = NULL; + len += PAGE_SIZE; } @@ -993,23 +1017,23 @@ get_more_pages: if (!locked_pages) goto release_pvec_pages; if (i) { - int j; - BUG_ON(!locked_pages || first < 0); + unsigned j, n = 0; + /* shift unused page to beginning of pvec */ + for (j = 0; j < pvec_pages; j++) { + if (!pvec.pages[j]) + continue; + if (n < j) + pvec.pages[n] = pvec.pages[j]; + n++; + } + pvec.nr = n; if (pvec_pages && i == pvec_pages && locked_pages < max_pages) { dout("reached end pvec, trying for more\n"); - pagevec_reinit(&pvec); + pagevec_release(&pvec); goto get_more_pages; } - - /* shift unused pages over in the pvec... we - * will need to release them below. */ - for (j = i; j < pvec_pages; j++) { - dout(" pvec leftover page %p\n", pvec.pages[j]); - pvec.pages[j-i+first] = pvec.pages[j]; - } - pvec.nr -= i-first; } new_request: @@ -1019,10 +1043,9 @@ new_request: req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, offset, &len, 0, num_ops, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, - truncate_size, false); + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, false); if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, @@ -1031,8 +1054,8 @@ new_request: CEPH_OSD_SLAB_OPS), CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, - truncate_size, true); + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } BUG_ON(len < page_offset(pages[locked_pages - 1]) + @@ -1048,7 +1071,7 @@ new_request: for (i = 0; i < locked_pages; i++) { u64 cur_offset = page_offset(pages[i]); if (offset + len != cur_offset) { - if (op_idx + do_sync + 1 == req->r_num_ops) + if (op_idx + 1 == req->r_num_ops) break; osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); @@ -1069,14 +1092,15 @@ new_request: len += PAGE_SIZE; } - if (snap_size != -1) { - len = min(len, snap_size - offset); + if (ceph_wbc.size_stable) { + len = min(len, ceph_wbc.i_size - offset); } else if (i == locked_pages) { /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ u64 min_len = len + 1 - PAGE_SIZE; - len = min(len, (u64)i_size_read(inode) - offset); + len = get_writepages_data_length(inode, pages[i - 1], + offset); len = max(len, min_len); } dout("writepages got pages at %llu~%llu\n", offset, len); @@ -1085,17 +1109,12 @@ new_request: 0, !!pool, false); osd_req_op_extent_update(req, op_idx, len); - if (do_sync) { - op_idx++; - osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0); - } BUG_ON(op_idx + 1 != req->r_num_ops); pool = NULL; if (i < locked_pages) { BUG_ON(num_ops <= req->r_num_ops); num_ops -= req->r_num_ops; - num_ops += do_sync; locked_pages -= i; /* allocate new pages array for next request */ @@ -1127,22 +1146,50 @@ new_request: if (pages) goto new_request; - if (wbc->nr_to_write <= 0) - done = 1; + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) + done = stop = true; release_pvec_pages: dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, pvec.nr ? pvec.pages[0] : NULL); pagevec_release(&pvec); - - if (locked_pages && !done) - goto retry; } if (should_loop && !done) { /* more to do; loop back to beginning of file */ dout("writepages looping back to beginning of file\n"); - should_loop = 0; + end = start_index - 1; /* OK even when start_index == 0 */ + + /* to write dirty pages associated with next snapc, + * we need to wait until current writes complete */ + if (wbc->sync_mode != WB_SYNC_NONE && + start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc.head_snapc) { + struct page *page; + unsigned i, nr; + index = 0; + while ((index <= end) && + (nr = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + PAGEVEC_SIZE))) { + for (i = 0; i < nr; i++) { + page = pvec.pages[i]; + if (page_snap_context(page) != snapc) + continue; + wait_on_page_writeback(page); + } + pagevec_release(&pvec); + cond_resched(); + } + } + + start_index = 0; index = 0; goto retry; } @@ -1152,8 +1199,8 @@ release_pvec_pages: out: ceph_osdc_put_request(req); - ceph_put_snap_context(snapc); - dout("writepages done, rc = %d\n", rc); + ceph_put_snap_context(last_snapc); + dout("writepages dend - startone, rc = %d\n", rc); return rc; } @@ -1165,8 +1212,7 @@ out: static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { - struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, - NULL, NULL); + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL); int ret = !oldest || snapc->seq <= oldest->seq; ceph_put_snap_context(oldest); @@ -1211,8 +1257,7 @@ retry_locked: * this page is already dirty in another (older) snap * context! is it writeable now? */ - oldest = get_oldest_context(inode, NULL, NULL, NULL); - + oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); dout(" page %p snapc %p not current or oldest\n", diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 174d6e6569a8..a3ab265d3215 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -209,7 +209,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); /* No caching for filesystem */ - if (fsc->fscache == NULL) + if (!fsc->fscache) return; /* Only cache for regular files that are read only */ diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7007ae2a5ad2..157fe59fbabe 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -490,13 +490,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, } /* - * if we are newly issued FILE_SHARED, mark dir not complete; we - * don't know what happened to this directory while we didn't - * have the cap. + * If FILE_SHARED is newly issued, mark dir not complete. We don't + * know what happened to this directory while we didn't have the cap. + * If FILE_SHARED is being revoked, also mark dir not complete. It + * stops on-going cached readdir. */ - if ((issued & CEPH_CAP_FILE_SHARED) && - (had & CEPH_CAP_FILE_SHARED) == 0) { - ci->i_shared_gen++; + if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { + if (issued & CEPH_CAP_FILE_SHARED) + ci->i_shared_gen++; if (S_ISDIR(ci->vfs_inode.i_mode)) { dout(" marking %p NOT complete\n", &ci->vfs_inode); __ceph_dir_clear_complete(ci); @@ -611,7 +612,7 @@ void ceph_add_cap(struct inode *inode, } if (flags & CEPH_CAP_FLAG_AUTH) { - if (ci->i_auth_cap == NULL || + if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { ci->i_auth_cap = cap; cap->mds_wanted = wanted; @@ -728,7 +729,7 @@ static void __touch_cap(struct ceph_cap *cap) struct ceph_mds_session *s = cap->session; spin_lock(&s->s_cap_lock); - if (s->s_cap_iterator == NULL) { + if (!s->s_cap_iterator) { dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); @@ -1248,7 +1249,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.mode = inode->i_mode; arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - arg.flags = 0; + if (list_empty(&ci->i_cap_snaps)) + arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; + else + arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; if (sync) arg.flags |= CEPH_CLIENT_CAPS_SYNC; @@ -1454,13 +1458,19 @@ retry: goto retry; } + // make sure flushsnap messages are sent in proper order. + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + __kick_flushing_caps(mdsc, session, ci, 0); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + } + __ceph_flush_snaps(ci, session); out: spin_unlock(&ci->i_ceph_lock); if (psession) { *psession = session; - } else { + } else if (session) { mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); } @@ -1901,11 +1911,7 @@ ack: (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { - spin_lock(&mdsc->cap_dirty_lock); - oldest_flush_tid = __get_oldest_flush_tid(mdsc); - spin_unlock(&mdsc->cap_dirty_lock); - __kick_flushing_caps(mdsc, session, ci, - oldest_flush_tid); + __kick_flushing_caps(mdsc, session, ci, 0); ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) @@ -2110,7 +2116,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret < 0) goto out; @@ -3422,7 +3428,7 @@ retry: tcap = __get_cap_for_mds(ci, target); if (tcap) { /* already have caps from the target */ - if (tcap->cap_id != t_cap_id || + if (tcap->cap_id == t_cap_id && ceph_seq_cmp(tcap->seq, t_seq) < 0) { dout(" updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 4e2d112c982f..d635496ea189 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -24,7 +24,7 @@ static int mdsmap_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mdsmap *mdsmap; - if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) + if (!fsc->mdsc || !fsc->mdsc->mdsmap) return 0; mdsmap = fsc->mdsc->mdsmap; seq_printf(s, "epoch %d\n", mdsmap->m_epoch); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ef7240ace576..019c2036d36f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -377,8 +377,10 @@ more: } /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; - req->r_direct_hash = ceph_frag_value(frag); - __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + if (op == CEPH_MDS_OP_READDIR) { + req->r_direct_hash = ceph_frag_value(frag); + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + } if (fi->last_name) { req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3d48c415f3cb..65a6fa12c857 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -175,7 +175,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); - if (cf == NULL) { + if (!cf) { ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ return -ENOMEM; } @@ -562,8 +562,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ssize_t ret; size_t len = iov_iter_count(to); - dout("sync_read on file %p %llu~%u %s\n", file, off, - (unsigned)len, + dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); if (!len) @@ -788,7 +787,7 @@ static void ceph_aio_retry_work(struct work_struct *work) goto out; } - req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); @@ -800,7 +799,6 @@ static void ceph_aio_retry_work(struct work_struct *work) } req->r_ops[0] = orig_req->r_ops[0]; - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = aio_req->mtime; req->r_data_offset = req->r_ops[0].extent.offset; @@ -847,8 +845,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_read_write (%s) on file %p %lld~%u\n", - (write ? "write" : "read"), file, pos, (unsigned)count); + dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", + (write ? "write" : "read"), file, pos, (unsigned)count, + snapc, snapc->seq); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) @@ -861,7 +860,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); - flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; } else { flags = CEPH_OSD_FLAG_READ; } @@ -874,8 +873,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, - /*include a 'startsync' command*/ - write ? 2 : 1, + 1, write ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ, flags, snapc, @@ -887,6 +885,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + if (write) + size = min_t(u64, size, fsc->mount_options->wsize); + else + size = min_t(u64, size, fsc->mount_options->rsize); + len = size; pages = dio_get_pages_alloc(iter, len, &start, &num_pages); if (IS_ERR(pages)) { @@ -922,7 +925,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_SIZE - 1)); - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = mtime; } @@ -1048,7 +1050,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", + file, pos, (unsigned)count, snapc, snapc->seq); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) @@ -1060,7 +1063,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; while ((len = iov_iter_count(from)) > 0) { size_t left; @@ -1307,6 +1310,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; +retry_snap: inode_lock(inode); /* We can write back this queue in page reclaim */ @@ -1338,7 +1342,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } -retry_snap: /* FIXME: not complete since it doesn't account for being at quota */ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; @@ -1387,14 +1390,6 @@ retry_snap: &prealloc_cf); else written = ceph_sync_write(iocb, &data, pos, snapc); - if (written == -EOLDSNAPC) { - dout("aio_write %p %llx.%llx %llu~%u" - "got EOLDSNAPC, retrying\n", - inode, ceph_vinop(inode), - pos, (unsigned)count); - inode_lock(inode); - goto retry_snap; - } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1428,10 +1423,15 @@ retry_snap: ceph_cap_string(got)); ceph_put_cap_refs(ci, got); + if (written == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)count); + goto retry_snap; + } + if (written >= 0) { if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; - written = generic_write_sync(iocb, written); } @@ -1481,13 +1481,13 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) offset += file->f_pos; break; case SEEK_DATA: - if (offset >= i_size) { + if (offset < 0 || offset >= i_size) { ret = -ENXIO; goto out; } break; case SEEK_HOLE: - if (offset >= i_size) { + if (offset < 0 || offset >= i_size) { ret = -ENXIO; goto out; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 220dfd87cbfa..373dab5173ca 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -52,7 +52,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) ino_t t = ceph_vino_to_ino(vino); inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); - if (inode == NULL) + if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { dout("get_inode created new inode %p %llx.%llx ino %llx\n", @@ -133,12 +133,9 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, } frag = kmalloc(sizeof(*frag), GFP_NOFS); - if (!frag) { - pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx " - "frag %x\n", &ci->vfs_inode, - ceph_vinop(&ci->vfs_inode), f); + if (!frag) return ERR_PTR(-ENOMEM); - } + frag->frag = f; frag->split_by = 0; frag->mds = -1; @@ -1070,7 +1067,6 @@ out_unlock: spin_unlock(&dentry->d_lock); if (old_lease_session) ceph_put_mds_session(old_lease_session); - return; } /* @@ -1177,7 +1173,7 @@ retry_lookup: dn = d_alloc(parent, &dname); dout("d_alloc %p '%.*s' = %p\n", parent, dname.len, dname.name, dn); - if (dn == NULL) { + if (!dn) { dput(parent); err = -ENOMEM; goto done; @@ -1477,7 +1473,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct dentry *dn; struct inode *in; int err = 0, skipped = 0, ret, i; - struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; u32 frag = le32_to_cpu(rhead->args.readdir.frag); u32 last_hash = 0; @@ -1510,8 +1505,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { - snapdir = ceph_get_snapdir(d_inode(parent)); - parent = d_find_alias(snapdir); dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", rinfo->dir_nr, parent); } else { @@ -1519,15 +1512,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, rinfo->dir_nr, parent); if (rinfo->dir_dir) ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); - } - if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && - !(rinfo->hash_order && last_hash)) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); - req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); - req->r_readdir_cache_idx = 0; + if (ceph_frag_is_leftmost(frag) && + req->r_readdir_offset == 2 && + !(rinfo->hash_order && last_hash)) { + /* note dir version at start of readdir so we can + * tell if any dentries get dropped */ + req->r_dir_release_cnt = + atomic64_read(&ci->i_release_count); + req->r_dir_ordered_cnt = + atomic64_read(&ci->i_ordered_count); + req->r_readdir_cache_idx = 0; + } } cache_ctl.index = req->r_readdir_cache_idx; @@ -1566,7 +1562,7 @@ retry_lookup: dn = d_alloc(parent, &dname); dout("d_alloc %p '%.*s' = %p\n", parent, dname.len, dname.name, dn); - if (dn == NULL) { + if (!dn) { dout("d_alloc badness\n"); err = -ENOMEM; goto out; @@ -1650,10 +1646,6 @@ out: req->r_readdir_cache_idx = cache_ctl.index; } ceph_readdir_cache_release(&cache_ctl); - if (snapdir) { - iput(snapdir); - dput(parent); - } dout("readdir_prepopulate done\n"); return err; } @@ -1841,9 +1833,20 @@ retry: * possibly truncate them.. so write AND block! */ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { + struct ceph_cap_snap *capsnap; + to = ci->i_truncate_size; + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + // MDS should have revoked Frw caps + WARN_ON_ONCE(capsnap->writing); + if (capsnap->dirty_pages && capsnap->size > to) + to = capsnap->size; + } + spin_unlock(&ci->i_ceph_lock); dout("__do_pending_vmtruncate %p flushing snaps first\n", inode); - spin_unlock(&ci->i_ceph_lock); + + truncate_pagecache(inode, to); + filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 666a9f274832..9dd6b836ac9e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -408,7 +408,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *session; - if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) + if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return NULL; session = mdsc->sessions[mds]; dout("lookup_mds_session %p %d\n", session, @@ -483,7 +483,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, dout("register_session realloc to %d\n", newmax); sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); - if (sa == NULL) + if (!sa) goto fail_realloc; if (mdsc->sessions) { memcpy(sa, mdsc->sessions, @@ -731,9 +731,16 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = NULL; if (req->r_inode) { - inode = req->r_inode; - ihold(inode); - } else if (req->r_dentry) { + if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { + inode = req->r_inode; + ihold(inode); + } else { + /* req->r_dentry is non-null for LSSNAP request. + * fall-thru */ + WARN_ON_ONCE(!req->r_dentry); + } + } + if (!inode && req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent; struct inode *dir; @@ -886,7 +893,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 /* Calculate serialized length of metadata */ metadata_bytes = 4; /* map length */ - for (i = 0; metadata[i][0] != NULL; ++i) { + for (i = 0; metadata[i][0]; ++i) { metadata_bytes += 8 + strlen(metadata[i][0]) + strlen(metadata[i][1]); metadata_key_count++; @@ -919,7 +926,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 ceph_encode_32(&p, metadata_key_count); /* Two length-prefixed strings for each entry in the map */ - for (i = 0; metadata[i][0] != NULL; ++i) { + for (i = 0; metadata[i][0]; ++i) { size_t const key_len = strlen(metadata[i][0]); size_t const val_len = strlen(metadata[i][1]); @@ -1122,7 +1129,7 @@ static int iterate_session_caps(struct ceph_mds_session *session, spin_lock(&session->s_cap_lock); p = p->next; - if (cap->ci == NULL) { + if (!cap->ci) { dout("iterate_session_caps finishing cap %p removal\n", cap); BUG_ON(cap->session != session); @@ -1748,7 +1755,7 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, int len, pos; unsigned seq; - if (dentry == NULL) + if (!dentry) return ERR_PTR(-EINVAL); retry: @@ -1771,7 +1778,7 @@ retry: len--; /* no leading '/' */ path = kmalloc(len+1, GFP_NOFS); - if (path == NULL) + if (!path) return ERR_PTR(-ENOMEM); pos = len; path[pos] = 0; /* trailing null */ @@ -2875,7 +2882,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } if (list_empty(&ci->i_cap_snaps)) { - snap_follows = 0; + snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0; } else { struct ceph_cap_snap *capsnap = list_first_entry(&ci->i_cap_snaps, @@ -3133,7 +3140,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, newmap->m_epoch, oldmap->m_epoch); for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { - if (mdsc->sessions[i] == NULL) + if (!mdsc->sessions[i]) continue; s = mdsc->sessions[i]; oldstate = ceph_mdsmap_get_state(oldmap, i); @@ -3280,7 +3287,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, mutex_lock(&session->s_mutex); session->s_seq++; - if (inode == NULL) { + if (!inode) { dout("handle_lease no inode %llx\n", vino.ino); goto release; } @@ -3438,7 +3445,7 @@ static void delayed_work(struct work_struct *work) for (i = 0; i < mdsc->max_sessions; i++) { struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); - if (s == NULL) + if (!s) continue; if (s->s_state == CEPH_MDS_SESSION_CLOSING) { dout("resending session close request for mds%d\n", @@ -3490,7 +3497,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) { + if (!mdsc->mdsmap) { kfree(mdsc); return -ENOMEM; } diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 1a748cf88535..33ced4c22732 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -112,7 +112,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u16 mdsmap_ev; m = kzalloc(sizeof(*m), GFP_NOFS); - if (m == NULL) + if (!m) return ERR_PTR(-ENOMEM); ceph_decode_need(p, end, 1 + 1, bad); @@ -138,7 +138,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_num_mds = m->m_max_mds; m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); - if (m->m_info == NULL) + if (!m->m_info) goto nomem; /* pick out active nodes from mds_info (state > 0) */ @@ -232,7 +232,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS); - if (info->export_targets == NULL) + if (!info->export_targets) goto nomem; for (j = 0; j < num_export_targets; j++) info->export_targets[j] = diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index dab5d6732345..1ffc8b426c1c 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -299,7 +299,8 @@ static int cmpu64_rev(const void *a, const void *b) /* * build the snap context for a given realm. */ -static int build_snap_context(struct ceph_snap_realm *realm) +static int build_snap_context(struct ceph_snap_realm *realm, + struct list_head* dirty_realms) { struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; @@ -313,7 +314,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) */ if (parent) { if (!parent->cached_context) { - err = build_snap_context(parent); + err = build_snap_context(parent, dirty_realms); if (err) goto fail; } @@ -332,7 +333,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) " (unchanged)\n", realm->ino, realm, realm->cached_context, realm->cached_context->seq, - (unsigned int) realm->cached_context->num_snaps); + (unsigned int)realm->cached_context->num_snaps); return 0; } @@ -373,7 +374,11 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - ceph_put_snap_context(realm->cached_context); + if (realm->cached_context) { + ceph_put_snap_context(realm->cached_context); + /* queue realm for cap_snap creation */ + list_add_tail(&realm->dirty_item, dirty_realms); + } realm->cached_context = snapc; return 0; @@ -394,15 +399,16 @@ fail: /* * rebuild snap context for the given realm and all of its children. */ -static void rebuild_snap_realms(struct ceph_snap_realm *realm) +static void rebuild_snap_realms(struct ceph_snap_realm *realm, + struct list_head *dirty_realms) { struct ceph_snap_realm *child; dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); - build_snap_context(realm); + build_snap_context(realm, dirty_realms); list_for_each_entry(child, &realm->children, child_item) - rebuild_snap_realms(child); + rebuild_snap_realms(child, dirty_realms); } @@ -624,13 +630,11 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) { struct ceph_inode_info *ci; struct inode *lastinode = NULL; - struct ceph_snap_realm *child; dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); - list_for_each_entry(ci, &realm->inodes_with_caps, - i_snap_realm_item) { + list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { struct inode *inode = igrab(&ci->vfs_inode); if (!inode) continue; @@ -643,14 +647,6 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); - list_for_each_entry(child, &realm->children, child_item) { - dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", - realm, realm->ino, child, child->ino); - list_del_init(&child->dirty_item); - list_add(&child->dirty_item, &realm->dirty_item); - } - - list_del_init(&realm->dirty_item); dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); } @@ -721,8 +717,6 @@ more: if (err < 0) goto fail; - /* queue realm for cap_snap creation */ - list_add(&realm->dirty_item, &dirty_realms); if (realm->seq > mdsc->last_snap_seq) mdsc->last_snap_seq = realm->seq; @@ -741,7 +735,7 @@ more: /* invalidate when we reach the _end_ (root) of the trace */ if (invalidate && p >= e) - rebuild_snap_realms(realm); + rebuild_snap_realms(realm, &dirty_realms); if (!first_realm) first_realm = realm; @@ -758,6 +752,7 @@ more: while (!list_empty(&dirty_realms)) { realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, dirty_item); + list_del_init(&realm->dirty_item); queue_realm_cap_snaps(realm); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index aa06a8c24792..e4082afedcb1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -49,9 +49,16 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) struct ceph_statfs st; u64 fsid; int err; + u64 data_pool; + + if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { + data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0]; + } else { + data_pool = CEPH_NOPOOL; + } dout("statfs\n"); - err = ceph_monc_do_statfs(&fsc->client->monc, &st); + err = ceph_monc_do_statfs(&fsc->client->monc, data_pool, &st); if (err < 0) return err; @@ -113,7 +120,6 @@ enum { Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, - Opt_cap_release_safety, Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, @@ -152,7 +158,6 @@ static match_table_t fsopt_tokens = { {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, - {Opt_cap_release_safety, "cap_release_safety=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, @@ -235,27 +240,43 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ case Opt_wsize: - fsopt->wsize = intval; + if (intval < PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) + return -EINVAL; + fsopt->wsize = ALIGN(intval, PAGE_SIZE); break; case Opt_rsize: - fsopt->rsize = intval; + if (intval < PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) + return -EINVAL; + fsopt->rsize = ALIGN(intval, PAGE_SIZE); break; case Opt_rasize: - fsopt->rasize = intval; + if (intval < 0) + return -EINVAL; + fsopt->rasize = ALIGN(intval + PAGE_SIZE - 1, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: + if (intval < 1) + return -EINVAL; fsopt->caps_wanted_delay_min = intval; break; case Opt_caps_wanted_delay_max: + if (intval < 1) + return -EINVAL; fsopt->caps_wanted_delay_max = intval; break; case Opt_readdir_max_entries: + if (intval < 1) + return -EINVAL; fsopt->max_readdir = intval; break; case Opt_readdir_max_bytes: + if (intval < PAGE_SIZE && intval != 0) + return -EINVAL; fsopt->max_readdir_bytes = intval; break; case Opt_congestion_kb: + if (intval < 1024) /* at least 1M */ + return -EINVAL; fsopt->congestion_kb = intval; break; case Opt_dirstat: @@ -392,7 +413,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->sb_flags = flags; fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->wsize = CEPH_MAX_WRITE_SIZE; + fsopt->rsize = CEPH_MAX_READ_SIZE; fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); if (!fsopt->snapdir_name) { @@ -402,7 +424,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); @@ -508,7 +529,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); - if (fsopt->rsize != CEPH_RSIZE_DEFAULT) + if (fsopt->rsize != CEPH_MAX_READ_SIZE) seq_printf(m, ",rsize=%d", fsopt->rsize); if (fsopt->rasize != CEPH_RASIZE_DEFAULT) seq_printf(m, ",rasize=%d", fsopt->rasize); @@ -520,9 +541,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) seq_printf(m, ",caps_wanted_delay_max=%d", fsopt->caps_wanted_delay_max); - if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) - seq_printf(m, ",cap_release_safety=%d", - fsopt->cap_release_safety); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) @@ -576,7 +594,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - if (fsopt->mds_namespace == NULL) { + if (!fsopt->mds_namespace) { ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); } else { @@ -597,13 +615,13 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, * to be processed in parallel, limit concurrency. */ fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); - if (fsc->wb_wq == NULL) + if (!fsc->wb_wq) goto fail_client; fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); - if (fsc->pg_inv_wq == NULL) + if (!fsc->pg_inv_wq) goto fail_wb_wq; fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); - if (fsc->trunc_wq == NULL) + if (!fsc->trunc_wq) goto fail_pg_inv_wq; /* set up mempools */ @@ -674,26 +692,26 @@ static int __init init_caches(void) __alignof__(struct ceph_inode_info), SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| SLAB_ACCOUNT, ceph_inode_init_once); - if (ceph_inode_cachep == NULL) + if (!ceph_inode_cachep) return -ENOMEM; ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_cachep == NULL) + if (!ceph_cap_cachep) goto bad_cap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_flush_cachep == NULL) + if (!ceph_cap_flush_cachep) goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_dentry_cachep == NULL) + if (!ceph_dentry_cachep) goto bad_dentry; ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); - if (ceph_file_cachep == NULL) + if (!ceph_file_cachep) goto bad_file; if ((error = ceph_fscache_register())) @@ -947,20 +965,10 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) return err; /* set ra_pages based on rasize mount option? */ - if (fsc->mount_options->rasize >= PAGE_SIZE) - sb->s_bdi->ra_pages = - (fsc->mount_options->rasize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - else - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - - if (fsc->mount_options->rsize > fsc->mount_options->rasize && - fsc->mount_options->rsize >= PAGE_SIZE) - sb->s_bdi->io_pages = - (fsc->mount_options->rsize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - else if (fsc->mount_options->rsize == 0) - sb->s_bdi->io_pages = ULONG_MAX; + sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT; + + /* set io_pages based on max osd read size */ + sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT; return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f02a2225fe42..279a2f401cf5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -46,12 +46,25 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */ +/* max size of osd read request, limited by libceph */ +#define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN +/* osd has a configurable limitaion of max write size. + * CEPH_MSG_MAX_DATA_LEN should be small enough. */ +#define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the oppotunity to + * update the mds to avoid sending another message later. + */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + struct ceph_mount_options { int flags; int sb_flags; @@ -61,7 +74,6 @@ struct ceph_mount_options { int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; - int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 11263f102e4c..3542b2c364cf 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -777,7 +777,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, spin_unlock(&ci->i_ceph_lock); /* security module gets xattr while filling trace */ - if (current->journal_info != NULL) { + if (current->journal_info) { pr_warn_ratelimited("sync getxattr %p " "during filling trace\n", inode); return -EBUSY; @@ -809,7 +809,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, memcpy(value, xattr->val, xattr->val_len); - if (current->journal_info != NULL && + if (current->journal_info && !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: @@ -1058,7 +1058,7 @@ do_sync_unlocked: up_read(&mdsc->snap_rwsem); /* security module set xattr while filling trace */ - if (current->journal_info != NULL) { + if (current->journal_info) { pr_warn_ratelimited("sync setxattr %p " "during filling trace\n", inode); err = -EBUSY; @@ -1108,7 +1108,7 @@ bool ceph_security_xattr_deadlock(struct inode *in) { struct ceph_inode_info *ci; bool ret; - if (in->i_security == NULL) + if (!in->i_security) return false; ci = ceph_inode(in); spin_lock(&ci->i_ceph_lock); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index c0474ac6cbf2..274ab5586dd0 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -368,9 +368,10 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) goto out; while (1) { + loff_t pos = ctx->pos - 2; + /* read entries from the directory file */ - ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir, - sizeof(*vdir)); + ret = kernel_read(host_file, vdir, sizeof(*vdir), &pos); if (ret < 0) { pr_err("%s: read dir %s failed %d\n", __func__, coda_f2s(&cii->c_fid), ret); diff --git a/fs/coredump.c b/fs/coredump.c index 592683711c64..0eec03696707 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -161,7 +161,7 @@ static int cn_print_exe_file(struct core_name *cn) if (!exe_file) return cn_esc_printf(cn, "%s (path unknown)", current->comm); - pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) { ret = -ENOMEM; goto put_exe_file; @@ -734,7 +734,7 @@ static int dax_writeback_one(struct block_device *bdev, } dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - dax_flush(dax_dev, pgoff, kaddr, size); + dax_flush(dax_dev, kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -929,7 +929,7 @@ int __dax_zero_page_range(struct block_device *bdev, return rc; } memset(kaddr + offset, 0, size); - dax_flush(dax_dev, pgoff, kaddr + offset, size); + dax_flush(dax_dev, kaddr + offset, size); dax_read_unlock(id); } return 0; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 9014479d0160..6b801186baa5 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -568,8 +568,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * 1) The lower mount is ro * 2) The ecryptfs_encrypted_view mount option is specified */ - if (path.dentry->d_sb->s_flags & MS_RDONLY || - mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) s->s_flags |= MS_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 039e627194a9..c596e7c03424 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -47,7 +47,7 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; if (!lower_file) return -EIO; - rc = kernel_write(lower_file, data, size, offset); + rc = kernel_write(lower_file, data, size, &offset); mark_inode_dirty_sync(ecryptfs_inode); return rc; } @@ -237,7 +237,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; if (!lower_file) return -EIO; - return kernel_read(lower_file, offset, data, size); + return kernel_read(lower_file, data, size, &offset); } /** diff --git a/fs/efs/super.c b/fs/efs/super.c index 368f7dd21c61..5c42f1e34a2f 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -306,7 +306,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) } brelse(bh); - if (!(s->s_flags & MS_RDONLY)) { + if (!sb_rdonly(s)) { #ifdef DEBUG pr_info("forcing read-only mode\n"); #endif diff --git a/fs/exec.c b/fs/exec.c index 01a9fb9d8ac3..ac34d9724684 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -885,23 +885,6 @@ struct file *open_exec(const char *name) } EXPORT_SYMBOL(open_exec); -int kernel_read(struct file *file, loff_t offset, - char *addr, unsigned long count) -{ - mm_segment_t old_fs; - loff_t pos = offset; - int result; - - old_fs = get_fs(); - set_fs(get_ds()); - /* The cast to a user pointer is valid due to the set_fs() */ - result = vfs_read(file, (void __user *)addr, count, &pos); - set_fs(old_fs); - return result; -} - -EXPORT_SYMBOL(kernel_read); - int kernel_read_file(struct file *file, void **buf, loff_t *size, loff_t max_size, enum kernel_read_file_id id) { @@ -939,8 +922,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, pos = 0; while (pos < i_size) { - bytes = kernel_read(file, pos, (char *)(*buf) + pos, - i_size - pos); + bytes = kernel_read(file, *buf + pos, i_size - pos, &pos); if (bytes < 0) { ret = bytes; goto out; @@ -948,7 +930,6 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, if (bytes == 0) break; - pos += bytes; } if (pos != i_size) { @@ -974,7 +955,7 @@ out: } EXPORT_SYMBOL_GPL(kernel_read_file); -int kernel_read_file_from_path(char *path, void **buf, loff_t *size, +int kernel_read_file_from_path(const char *path, void **buf, loff_t *size, loff_t max_size, enum kernel_read_file_id id) { struct file *file; @@ -1567,6 +1548,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) int prepare_binprm(struct linux_binprm *bprm) { int retval; + loff_t pos = 0; bprm_fill_uid(bprm); @@ -1577,7 +1559,7 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->called_set_creds = 1; memset(bprm->buf, 0, BINPRM_BUF_SIZE); - return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); + return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } EXPORT_SYMBOL(prepare_binprm); @@ -1763,9 +1745,9 @@ static int do_execveat_common(int fd, struct filename *filename, bprm->filename = filename->name; } else { if (filename->name[0] == '\0') - pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); + pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else - pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", + pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); if (!pathbuf) { retval = -ENOMEM; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index fc18edd81815..1458706bd2ec 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -52,7 +52,7 @@ void ext2_error(struct super_block *sb, const char *function, struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { spin_lock(&sbi->s_lock); sbi->s_mount_state |= EXT2_ERROR_FS; es->s_state |= cpu_to_le16(EXT2_ERROR_FS); @@ -151,7 +151,7 @@ static void ext2_put_super (struct super_block * sb) ext2_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; } - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { struct ext2_super_block *es = sbi->s_es; spin_lock(&sbi->s_lock); @@ -943,8 +943,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) le32_to_cpu(features)); goto failed_mount; } - if (!(sb->s_flags & MS_RDONLY) && - (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ + if (!sb_rdonly(sb) && (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of " "unsupported optional features (%x)", le32_to_cpu(features)); @@ -1173,7 +1172,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) ext2_msg(sb, KERN_WARNING, "warning: mounting ext3 filesystem as ext2"); - if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY)) + if (ext2_setup_super (sb, es, sb_rdonly(sb))) sb->s_flags |= MS_RDONLY; ext2_write_super(sb); return 0; @@ -1305,7 +1304,7 @@ static int ext2_unfreeze(struct super_block *sb) static void ext2_write_super(struct super_block *sb) { - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) ext2_sync_fs(sb, 1); } @@ -1343,7 +1342,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) "dax flag with busy inodes while remounting"); sbi->s_mount_opt ^= EXT2_MOUNT_DAX; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) { spin_unlock(&sbi->s_lock); return 0; } diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index dd106b1d5d89..5b342ac67d2e 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -47,7 +47,7 @@ static int ext4_journal_check_start(struct super_block *sb) if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return -EIO; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 57dcaea762c3..b1da660ac3bc 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -223,6 +223,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) @@ -371,7 +373,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return -EIO; if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && - !(sb->s_flags & MS_RDONLY))) { + !sb_rdonly(sb))) { sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; /* * Sample where the filesystem has been mounted and @@ -431,9 +433,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return ret; } - /* Set the flags to support nowait AIO */ - filp->f_mode |= FMODE_AIO_NOWAIT; - + filp->f_mode |= FMODE_NOWAIT; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index aae2c3971cef..f9230580a84b 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -107,7 +107,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); - if (inode->i_sb->s_flags & MS_RDONLY) { + if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_mount_flags value */ smp_rmb(); if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 71e93a23cec3..ee823022aa34 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1382,7 +1382,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int num, ret = 0, used_blks = 0; /* This should not happen, but just to be sure check this */ - if (sb->s_flags & MS_RDONLY) { + if (sb_rdonly(sb)) { ret = 1; goto out; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 77cdce1f17ce..84c54f15f1dd 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -185,7 +185,7 @@ static int kmmpd(void *data) goto exit_thread; } - if (sb->s_flags & MS_RDONLY) { + if (sb_rdonly(sb)) { ext4_warning(sb, "kmmpd being stopped since filesystem " "has been remounted as readonly."); goto exit_thread; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 71b9a667e1bc..b104096fce9e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -405,7 +405,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) static void ext4_handle_error(struct super_block *sb) { - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; if (!test_opt(sb, ERRORS_CONT)) { @@ -587,8 +587,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ - if (errno == -EROFS && journal_current_handle() == NULL && - (sb->s_flags & MS_RDONLY)) + if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { @@ -628,7 +627,7 @@ void __ext4_abort(struct super_block *sb, const char *function, sb->s_id, function, line, &vaf); va_end(args); - if ((sb->s_flags & MS_RDONLY) == 0) { + if (sb_rdonly(sb) == 0) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; /* @@ -889,11 +888,11 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!(sb->s_flags & MS_RDONLY) && !aborted) { + if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) ext4_commit_super(sb, 1); for (i = 0; i < sbi->s_gdb_count; i++) @@ -2100,7 +2099,7 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset) struct super_block *sb = seq->private; int rc; - seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw"); + seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_puts(seq, "\n"); return rc; @@ -2368,7 +2367,7 @@ static int ext4_check_descriptors(struct super_block *sb, "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } @@ -3136,8 +3135,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (first_not_zeroed == ngroups || - (sb->s_flags & MS_RDONLY) || + if (first_not_zeroed == ngroups || sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) goto out; @@ -3683,7 +3681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ - if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) + if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) goto failed_mount; blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); @@ -3812,12 +3810,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif @@ -4017,7 +4015,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_journal_needs_recovery(sb)); - if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY)) + if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) goto failed_mount3a; @@ -4029,7 +4027,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) err = ext4_load_journal(sb, es, journal_devnum); if (err) goto failed_mount3a; - } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); @@ -4143,7 +4141,7 @@ no_journal: goto failed_mount_wq; } - if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) && + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && !ext4_has_feature_encrypt(sb)) { ext4_set_feature_encrypt(sb); ext4_commit_super(sb, 1); @@ -4197,7 +4195,7 @@ no_journal: goto failed_mount4; } - if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + if (ext4_setup_super(sb, es, sb_rdonly(sb))) sb->s_flags |= MS_RDONLY; /* determine the minimum size of new large inodes, if present */ @@ -4285,7 +4283,7 @@ no_journal: #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ - if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) { + if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount8; @@ -4609,7 +4607,7 @@ static int ext4_load_journal(struct super_block *sb, * can get read-write access to the device. */ if (ext4_has_feature_journal_needs_recovery(sb)) { - if (sb->s_flags & MS_RDONLY) { + if (sb_rdonly(sb)) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); if (really_read_only) { @@ -4764,8 +4762,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (jbd2_journal_flush(journal) < 0) goto out; - if (ext4_has_feature_journal_needs_recovery(sb) && - sb->s_flags & MS_RDONLY) { + if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { ext4_clear_feature_journal_needs_recovery(sb); ext4_commit_super(sb, 1); } @@ -4821,7 +4818,7 @@ int ext4_force_commit(struct super_block *sb) { journal_t *journal; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 0; journal = EXT4_SB(sb)->s_journal; @@ -4886,7 +4883,7 @@ static int ext4_freeze(struct super_block *sb) int error = 0; journal_t *journal; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 0; journal = EXT4_SB(sb)->s_journal; @@ -4921,7 +4918,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if ((sb->s_flags & MS_RDONLY) || ext4_forced_shutdown(EXT4_SB(sb))) + if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) return 0; if (EXT4_SB(sb)->s_journal) { @@ -5059,7 +5056,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (*flags & MS_LAZYTIME) sb->s_flags |= MS_LAZYTIME; - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; @@ -5154,7 +5151,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * Reinitialize lazy itable initialization thread based on * current settings */ - if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) + if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) ext4_unregister_li_request(sb); else { ext4_group_t first_not_zeroed; @@ -5731,7 +5728,7 @@ static inline int ext2_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; @@ -5762,7 +5759,7 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 0; if (!ext4_has_feature_journal(sb)) return 0; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index b4b8438c42ef..436b3a1464d9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -207,15 +207,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5b876f6d3f6b..04fe1df052b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -230,8 +230,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -244,7 +245,7 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) @@ -263,6 +264,12 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -283,7 +290,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -295,7 +302,7 @@ skip_write: } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -346,7 +353,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -581,11 +588,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - int err; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -601,14 +621,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); - return err; + goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); - return 0; +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + + return err; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) @@ -904,7 +931,14 @@ retry: if (inode) { unsigned long cur_ino = inode->i_ino; + if (is_dir) + F2FS_I(inode)->cp_task = current; + filemap_fdatawrite(inode->i_mapping); + + if (is_dir) + F2FS_I(inode)->cp_task = NULL; + iput(inode); /* We need to give cpu to another writers. */ if (ino == cur_ino) { @@ -1017,7 +1051,7 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1115,7 +1149,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1194,7 +1228,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT BITS pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1249,7 +1283,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fb96bb71da00..36b535207c88 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -457,14 +457,65 @@ out_fail: return err; } +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -508,8 +559,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -570,16 +621,6 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct page *page; struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -620,9 +661,7 @@ got_it: return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -756,7 +795,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; @@ -782,7 +822,7 @@ alloc: static inline bool __force_buffered_io(struct inode *inode, int rw) { - return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + return (f2fs_encrypted_file(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } @@ -814,7 +854,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -903,7 +943,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { @@ -1040,7 +1080,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1146,35 +1186,6 @@ out: return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct bio *bio; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -1240,7 +1251,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: @@ -1271,12 +1282,11 @@ submit_and_realloc: bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1341,11 +1351,11 @@ static int encrypt_one_page(struct f2fs_io_info *fio) struct inode *inode = fio->page->mapping->host; gfp_t gfp_flags = GFP_NOFS; - if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + if (!f2fs_encrypted_file(inode)) return 0; /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, @@ -1471,7 +1481,8 @@ out: } static int __write_data_page(struct page *page, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1492,6 +1503,7 @@ static int __write_data_page(struct page *page, bool *submitted, .encrypted_page = NULL, .submitted = false, .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); @@ -1598,7 +1610,7 @@ redirty_out: static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, NULL, wbc); + return __write_data_page(page, NULL, wbc, FS_DATA_IO); } /* @@ -1607,7 +1619,8 @@ static int f2fs_write_data_page(struct page *page, * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1697,7 +1710,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_data_page(page, &submitted, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1752,8 +1765,9 @@ continue_unlock: return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1790,7 +1804,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, goto skip_write; blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); if (wbc->sync_mode == WB_SYNC_ALL) @@ -1809,6 +1823,16 @@ skip_write: return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; @@ -1858,7 +1882,7 @@ restart: set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) @@ -1956,8 +1980,8 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -1971,21 +1995,9 @@ repeat: zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); - goto fail; - } - bio->bi_opf = REQ_OP_READ; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -2075,10 +2087,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { f2fs_write_failed(mapping, offset + count); + } } trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 37f9c7f55605..c0c933ad43c8 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -705,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -735,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 94a88b233e98..9a7c90386947 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -91,6 +91,8 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_LFS 0x00040000 #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -110,8 +112,12 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -142,6 +148,8 @@ enum { (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -190,11 +198,18 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : (blk_num - 1)) +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + enum { D_PREP, D_SUBMIT, @@ -230,11 +245,14 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ @@ -308,6 +326,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, struct f2fs_flush_device) #define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -332,6 +351,9 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +#define F2FS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define F2FS_IOC_FSSETXATTR FS_IOC_FSSETXATTR + struct f2fs_gc_range { u32 sync; u64 start; @@ -355,16 +377,36 @@ struct f2fs_flush_device { u32 segments; /* # of segments to flush */ }; +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; static inline void make_dentry_ptr_block(struct inode *inode, @@ -372,19 +414,26 @@ static inline void make_dentry_ptr_block(struct inode *inode, { d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } static inline void make_dentry_ptr_inline(struct inode *inode, - struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) + struct f2fs_dentry_ptr *d, void *t) { + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + d->inode = inode; - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -473,12 +522,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -521,6 +571,7 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ @@ -533,10 +584,15 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -823,6 +879,23 @@ enum need_lock_type { LOCK_RETRY, }; +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ @@ -837,6 +910,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; #define is_read_io(rw) ((rw) == READ) @@ -1028,6 +1102,11 @@ struct f2fs_sb_info { #endif spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs suppport */ struct kobject s_kobj; struct completion s_kobj_unregister; @@ -1046,10 +1125,19 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1137,6 +1225,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -1760,20 +1869,38 @@ static inline bool IS_INODE(struct page *page) return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1836,6 +1963,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1864,6 +2005,8 @@ enum { FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1983,6 +2126,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -1999,6 +2144,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -2009,8 +2161,8 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) @@ -2069,11 +2221,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + int extra_size = get_extra_isize(inode); - return (void *)&(ri->i_addr[1]); + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2164,10 +2317,50 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } +static inline int get_extra_isize(struct inode *inode) +{ + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} + /* * file.c */ @@ -2187,6 +2380,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); * inode.c */ void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); @@ -2255,6 +2450,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) @@ -2285,15 +2482,15 @@ int truncate_xattr_node(struct inode *inode, struct page *page); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *get_node_page_ra(struct page *parent, int start); void move_node_page(struct page *node_page, int gc_type); int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); @@ -2314,6 +2511,7 @@ void destroy_node_manager_caches(void); /* * segment.c */ +bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); @@ -2336,7 +2534,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); int rewrite_data_page(struct f2fs_io_info *fio); @@ -2353,8 +2552,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, @@ -2377,7 +2575,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write); + long nr_to_write, enum iostat_type io_type); void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); @@ -2430,6 +2628,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void f2fs_set_page_dirty_nobuffers(struct page *page); +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -2726,10 +2927,10 @@ void destroy_extent_cache(void); /* * sysfs.c */ -int __init f2fs_register_sysfs(void); -void f2fs_unregister_sysfs(void); -int f2fs_init_sysfs(struct f2fs_sb_info *sbi); -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi); +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); /* * crypto support @@ -2739,6 +2940,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -2761,6 +2967,21 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 843a0d99f7ea..517e112c8a9a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -98,14 +98,16 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); @@ -274,9 +276,19 @@ sync_nodes: goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, ino); - if (ret) - goto out; + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark. + */ + if (!atomic) { + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + } /* once recovery info is written, don't need to tack this */ remove_ino_entry(sbi, ino, APPEND_INO); @@ -382,7 +394,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -467,9 +480,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -647,7 +664,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags; - flags = fi->i_flags & FS_FL_USER_VISIBLE; + flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); if (flags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & FS_COMPR_FL) @@ -927,7 +944,8 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -1003,8 +1021,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { @@ -1173,7 +1191,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1184,8 +1203,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not guarantee entire block * allocation. @@ -1495,33 +1514,67 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) return 0; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) - -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +static int f2fs_file_flush(struct file *file, fl_owner_t id) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; - else - return flags & F2FS_OTHER_FLMASK; + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction is crashed, we should do + * roll-back. Otherwise, other reader/write can see corrupted database + * until all the writers close its file. Since this should be done + * before dropping file lock, it needs to do in ->flush. + */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; } static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags = fi->i_flags & + (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); return put_user(flags, (int __user *)arg); } +static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int oldflags; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return -EPERM; + + flags = f2fs_mask_flags(inode->i_mode, flags); + + oldflags = fi->i_flags; + + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + flags = flags & (FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL); + flags |= oldflags & ~(FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL); + fi->i_flags = flags; + + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + else + clear_inode_flag(inode, FI_PROJ_INHERIT); + + inode->i_ctime = current_time(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return 0; +} + static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags; - unsigned int oldflags; int ret; if (!inode_owner_or_capable(inode)) @@ -1536,31 +1589,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode_lock(inode); - /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - ret = -EPERM; - goto unlock_out; - } - - flags = f2fs_mask_flags(inode->i_mode, flags); - - oldflags = fi->i_flags; - - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - ret = -EPERM; - goto unlock_out; - } - } - - flags = flags & FS_FL_USER_MODIFIABLE; - flags |= oldflags & ~FS_FL_USER_MODIFIABLE; - fi->i_flags = flags; + ret = __f2fs_ioc_setflags(inode, flags); - inode->i_ctime = current_time(inode); - f2fs_set_inode_flags(inode); - f2fs_mark_inode_dirty_sync(inode, false); -unlock_out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1610,10 +1640,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); goto out; } inc_stat: + F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: @@ -1647,10 +1679,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: inode_unlock(inode); @@ -1786,7 +1819,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -2043,7 +2076,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -2085,7 +2118,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -2384,6 +2417,210 @@ out: return ret; } +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + +#ifdef CONFIG_QUOTA +static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct page *ipage; + kprojid_t kprojid; + int err; + + if (!f2fs_sb_has_project_quota(sb)) { + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (!f2fs_has_extra_attr(inode)) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + return 0; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = -EPERM; + inode_lock(inode); + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto out_unlock; + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out_unlock; + } + + if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, + i_projid)) { + err = -EOVERFLOW; + f2fs_put_page(ipage, 1); + goto out_unlock; + } + f2fs_put_page(ipage, 1); + + dquot_initialize(inode); + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + + F2FS_I(inode)->i_projid = kprojid; + inode->i_ctime = current_time(inode); +out_dirty: + f2fs_mark_inode_dirty_sync(inode, true); +out_unlock: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; +} +#else +static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +{ + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 f2fs_iflags_to_xflags(unsigned long iflags) +{ + __u32 xflags = 0; + + if (iflags & FS_SYNC_FL) + xflags |= FS_XFLAG_SYNC; + if (iflags & FS_IMMUTABLE_FL) + xflags |= FS_XFLAG_IMMUTABLE; + if (iflags & FS_APPEND_FL) + xflags |= FS_XFLAG_APPEND; + if (iflags & FS_NODUMP_FL) + xflags |= FS_XFLAG_NODUMP; + if (iflags & FS_NOATIME_FL) + xflags |= FS_XFLAG_NOATIME; + if (iflags & FS_PROJINHERIT_FL) + xflags |= FS_XFLAG_PROJINHERIT; + return xflags; +} + +#define F2FS_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ + FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +#define F2FS_FL_XFLAG_VISIBLE (FS_SYNC_FL | \ + FS_IMMUTABLE_FL | \ + FS_APPEND_FL | \ + FS_NODUMP_FL | \ + FS_NOATIME_FL | \ + FS_PROJINHERIT_FL) + +/* Transfer xflags flags to internal */ +static inline unsigned long f2fs_xflags_to_iflags(__u32 xflags) +{ + unsigned long iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= FS_SYNC_FL; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (xflags & FS_XFLAG_APPEND) + iflags |= FS_APPEND_FL; + if (xflags & FS_XFLAG_NODUMP) + iflags |= FS_NODUMP_FL; + if (xflags & FS_XFLAG_NOATIME) + iflags |= FS_NOATIME_FL; + if (xflags & FS_XFLAG_PROJINHERIT) + iflags |= FS_PROJINHERIT_FL; + + return iflags; +} + +static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags & + (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL)); + + if (f2fs_sb_has_project_quota(inode->i_sb)) + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, + fi->i_projid); + + if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} + +static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct fsxattr fa; + unsigned int flags; + int err; + + if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa))) + return -EFAULT; + + /* Make sure caller has proper permission */ + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (fa.fsx_xflags & ~F2FS_SUPPORTED_FS_XFLAGS) + return -EOPNOTSUPP; + + flags = f2fs_xflags_to_iflags(fa.fsx_xflags); + if (f2fs_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + + err = mnt_want_write_file(filp); + if (err) + return err; + + inode_lock(inode); + flags = (fi->i_flags & ~F2FS_FL_XFLAG_VISIBLE) | + (flags & F2FS_FL_XFLAG_VISIBLE); + err = __f2fs_ioc_setflags(inode, flags); + inode_unlock(inode); + mnt_drop_write_file(filp); + if (err) + return err; + + err = f2fs_ioc_setproject(filp, fa.fsx_projid); + if (err) + return err; + + return 0; +} long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -2426,6 +2663,12 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_move_range(filp, arg); case F2FS_IOC_FLUSH_DEVICE: return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); + case F2FS_IOC_FSGETXATTR: + return f2fs_ioc_fsgetxattr(filp, arg); + case F2FS_IOC_FSSETXATTR: + return f2fs_ioc_fssetxattr(filp, arg); default: return -ENOTTY; } @@ -2455,6 +2698,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); @@ -2491,6 +2737,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DEFRAGMENT: case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: + case F2FS_IOC_FSGETXATTR: + case F2FS_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; @@ -2506,6 +2755,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fa3d2e2df8e7..bfe6a8ccc3a0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,16 +28,21 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; set_freezable(); do { wait_event_interruptible_timeout(*wq, - kthread_should_stop() || freezing(current), + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; if (kthread_should_stop()) @@ -55,6 +60,9 @@ static int gc_thread_func(void *data) } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -69,19 +77,24 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. */ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; + + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ @@ -93,6 +106,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; @@ -110,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake= 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -259,20 +277,11 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, valid_blocks * 2 : valid_blocks; } -static unsigned int get_ssr_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - struct seg_entry *se = get_seg_entry(sbi, segno); - - return se->ckpt_valid_blocks > se->valid_blocks ? - se->ckpt_valid_blocks : se->valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) - return get_ssr_cost(sbi, segno); + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) @@ -582,7 +591,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) @@ -590,8 +599,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. + */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -684,6 +697,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = newaddr; f2fs_submit_page_write(&fio); + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) @@ -731,6 +746,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .page = page, .encrypted_page = NULL, .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; @@ -819,8 +835,7 @@ next_step: continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -854,14 +869,18 @@ next_step: continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx, segno, off); + if (f2fs_encrypted_file(inode)) + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type, segno, off); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ -898,7 +917,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -944,6 +963,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } @@ -954,21 +977,17 @@ next: blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, true) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno) { int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; unsigned int init_segno = segno; struct gc_inode_list gc_list = { @@ -976,6 +995,15 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { @@ -1002,17 +1030,20 @@ gc_more: gc_type = FG_GC; } - ret = -EINVAL; /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ - if (gc_type == BG_GC && !background) + if (gc_type == BG_GC && !background) { + ret = -EINVAL; goto stop; - if (!__get_victim(sbi, &segno, gc_type)) + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; goto stop; - ret = 0; + } - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; @@ -1029,6 +1060,16 @@ gc_more: stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { @@ -65,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait = gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e0fd4376e6fb..8322e4e7bb3f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -22,10 +22,10 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; @@ -44,6 +44,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; if (PageUptodate(page)) @@ -51,12 +52,12 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) @@ -67,13 +68,13 @@ void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) + if (from >= MAX_INLINE_DATA(inode)) return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); if (from == 0) @@ -116,6 +117,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .op_flags = REQ_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; @@ -200,6 +202,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -216,11 +220,16 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -255,9 +264,9 @@ process_inline: f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,11 +294,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -300,9 +309,9 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -316,19 +325,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; + void *inline_dentry; - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -337,11 +346,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -356,25 +366,24 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out remainder part of dentry and filename * field, since we have used bitmap for marking the usage status of * them, besides, we can also ignore copying/zeroing reserved space * of dentry block, because them haven't been used so far. */ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -395,14 +404,13 @@ out: return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -444,19 +452,19 @@ punch_dentry_pages: } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -473,9 +481,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -484,7 +492,7 @@ recover: } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -500,7 +508,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *inline_dentry = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -510,10 +518,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); - bit_pos = room_for_filename(&inline_dentry->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; @@ -534,7 +543,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name, NULL); - make_dentry_ptr_inline(NULL, &d, inline_dentry); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -557,7 +565,8 @@ out: void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -565,11 +574,12 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 1); @@ -586,20 +596,21 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *inline_dentry; + void *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - inline_dentry = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&inline_dentry->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -609,25 +620,27 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) - ctx->pos = NR_INLINE_DENTRY; + ctx->pos = d.max; f2fs_put_page(ipage, 1); return err < 0 ? err : 0; @@ -652,7 +665,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -661,7 +674,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6cd312a17c69..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -49,20 +49,22 @@ void f2fs_set_inode_flags(struct inode *inode) static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,25 +73,27 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { @@ -104,12 +108,84 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -153,6 +229,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -166,6 +245,16 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -292,6 +381,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -416,6 +519,9 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + /* ino == 0, if f2fs_new_inode() was failed t*/ if (inode->i_ino) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 760d85223c81..a4dab98c4b7b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -58,6 +58,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto fail_drop; @@ -72,6 +79,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -85,6 +97,15 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -204,6 +225,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -261,6 +287,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -724,6 +754,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; @@ -912,6 +947,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d53fe620939e..fca87835a1da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,6 +19,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include <trace/events/f2fs.h> @@ -554,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -578,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -613,7 +616,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -654,7 +657,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -876,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -1022,11 +1028,10 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info new_ni; @@ -1170,6 +1175,11 @@ repeat: err = -EIO; goto out_err; } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " @@ -1177,9 +1187,9 @@ page_hit: nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - ClearPageUptodate(page); err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); return ERR_PTR(err); } @@ -1326,7 +1336,8 @@ continue_unlock: } static int __write_node_page(struct page *page, bool atomic, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1339,6 +1350,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .io_type = io_type, }; trace_f2fs_writepage(page, NODE); @@ -1395,6 +1407,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (submitted) *submitted = fio.submitted; + if (do_balance) + f2fs_balance_fs(sbi, false); return 0; redirty_out: @@ -1405,7 +1419,7 @@ redirty_out: static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc); + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1493,7 +1507,8 @@ continue_unlock: ret = __write_node_page(page, atomic && page == last_page, - &submitted, wbc); + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1530,7 +1545,8 @@ out: return ret ? -EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1608,7 +1624,8 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - ret = __write_node_page(page, false, &submitted, wbc); + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type); if (ret) unlock_page(page); else if (submitted) @@ -1697,7 +1714,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -2191,7 +2208,8 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; struct page *xpage; @@ -2207,22 +2225,22 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) recover_xnid: /* 2: update xattr nid in inode */ - remove_free_nid(sbi, new_xnid); - f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(inc_valid_node_count(sbi, inode, false))) - f2fs_bug_on(sbi, 1); + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; + + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } + + alloc_nid_done(sbi, new_xnid); update_inode_page(inode); /* 3: update and set xattr node page dirty */ - xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid); - if (!xpage) - return -ENOMEM; - - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE); + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); set_page_dirty(xpage); f2fs_put_page(xpage, 1); @@ -2262,7 +2280,14 @@ retry: dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 907d6b7dde6a..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ retry: err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -226,18 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { + bool quota_inode = false; + if (!check_only && IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -291,7 +317,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; /* Get the previous summary */ - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; @@ -328,10 +354,18 @@ got_it: f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -361,7 +395,8 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -414,8 +449,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) @@ -557,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct list_head dir_list; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -573,11 +623,11 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -586,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages to be used by the recovery */ @@ -599,8 +649,6 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ @@ -614,5 +662,12 @@ out: } kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6f8fc4a6e701..621b9b3d320b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -17,10 +17,12 @@ #include <linux/swap.h> #include <linux/timer.h> #include <linux/freezer.h> +#include <linux/sched/signal.h> #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include <trace/events/f2fs.h> @@ -167,6 +169,21 @@ found: return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -213,9 +230,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } @@ -248,6 +271,7 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } @@ -292,6 +316,7 @@ static int __commit_inmem_pages(struct inode *inode, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -309,17 +334,21 @@ static int __commit_inmem_pages(struct inode *inode, inode_dec_dirty_pages(inode); remove_dirty_inode(inode); } - +retry: fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; last_idx = page->index; @@ -481,6 +510,8 @@ repeat: if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -499,6 +530,8 @@ repeat: fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -519,8 +552,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return ret; } - if (!atomic_read(&fcc->issing_flush)) { - atomic_inc(&fcc->issing_flush); + if (atomic_inc_return(&fcc->issing_flush) == 1) { ret = submit_flush_wait(sbi); atomic_dec(&fcc->issing_flush); @@ -530,18 +562,39 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) init_completion(&cmd.wait); - atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->issing_flush); } else { - llist_del_all(&fcc->issue_list); - atomic_set(&fcc->issing_flush, 0); + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } } return cmd.ret; @@ -778,11 +831,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, sentry = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blk); - size = min((unsigned long)(end - blk), max_blocks); + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); - blk += size; + blk = START_BLOCK(sbi, segno + 1); } #endif } @@ -815,6 +871,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, submit_bio(bio); list_move_tail(&dc->list, &dcc->wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); } } else { __remove_discard_cmd(sbi, dc); @@ -996,32 +1054,81 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int i, iter = 0; + int iter = 0, issued = 0; + int i; + bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - if (!issue_cond || is_idle(sbi)) + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { + __submit_discard_cmd(sbi, dc); + issued++; + + if (fatal_signal_pending(current)) + break; + continue; + } + + if (!issue_cond) { __submit_discard_cmd(sbi, dc); - if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + issued++; + continue; + } + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; + } else { + io_interrupted = true; + } + + if (++iter >= DISCARD_ISSUE_RATE) goto out; } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); } out: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); } static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, @@ -1102,34 +1209,63 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super */ +/* This comes from f2fs_put_super and f2fs_trim_fs */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); __wait_discard_cmd(sbi, false); } +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; set_freezable(); do { - wait_event_interruptible(*q, kthread_should_stop() || - freezing(current) || - atomic_read(&dcc->discard_cmd_cnt)); + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); if (try_to_freeze()) continue; if (kthread_should_stop()) return 0; - __issue_discard_cmd(sbi, true); - __wait_discard_cmd(sbi, true); + if (dcc->discard_wake) { + dcc->discard_wake = 0; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } + + sb_end_intwrite(sbi->sb); - congestion_wait(BLK_RW_SYNC, HZ/50); } while (!kthread_should_stop()); return 0; } @@ -1320,7 +1456,8 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; @@ -1402,11 +1539,11 @@ skip: goto find_next; list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= total_len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); + wake_up_discard_thread(sbi, false); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) @@ -1424,9 +1561,13 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) + for (i = 0; i < MAX_PLIST_NUM; i++) { INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); @@ -1491,6 +1632,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -1507,17 +1652,25 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } #endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; @@ -1528,17 +1681,25 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se->ckpt_valid_blocks++; } } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } #endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -1900,7 +2061,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1921,12 +2082,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) @@ -1990,7 +2149,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -2083,6 +2242,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -2202,9 +2364,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + if (add_list) { struct f2fs_bio_info *io; @@ -2236,7 +2401,8 @@ reallocate: } } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, @@ -2255,6 +2421,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) set_page_writeback(page); f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -2263,6 +2431,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -2276,13 +2446,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - return f2fs_submit_page_bio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -2324,7 +2503,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -2343,7 +2522,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } @@ -2382,8 +2561,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 6b871b492fd5..e0a6cc23ace3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -492,29 +492,11 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) -{ - return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - - if (test_opt(sbi, LFS)) - return false; - - return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); -} - static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { @@ -577,6 +559,10 @@ static inline bool need_inplace_update_policy(struct inode *inode, if (test_opt(sbi, LFS)) return false; + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -799,3 +785,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 32e4c025e97e..89f61eb3d167 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -25,6 +25,7 @@ #include <linux/quotaops.h> #include <linux/f2fs_fs.h> #include <linux/sysfs.h> +#include <linux/quota.h> #include "f2fs.h" #include "node.h" @@ -107,8 +108,20 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, Opt_usrquota, Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -144,8 +157,20 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; @@ -157,7 +182,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -168,6 +193,104 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. " + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; +} +#endif + static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -175,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -386,15 +512,76 @@ static int parse_options(struct super_block *sb, char *options) sb->s_flags &= ~MS_LAZYTIME; break; #ifdef CONFIG_QUOTA + case Opt_quota: case Opt_usrquota: set_opt(sbi, USRQUOTA); break; case Opt_grpquota: set_opt(sbi, GRPQUOTA); break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; #else + case Opt_quota: case Opt_usrquota: case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: f2fs_msg(sb, KERN_INFO, "quota operations not supported"); break; @@ -406,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_msg(sb, KERN_ERR, @@ -439,6 +630,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); #ifdef CONFIG_QUOTA memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); @@ -446,6 +638,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) #endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } @@ -584,7 +777,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } -static void f2fs_quota_off_umount(struct super_block *sb); static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -642,7 +834,7 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->ckpt); - f2fs_exit_sysfs(sbi); + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -651,6 +843,10 @@ static void f2fs_put_super(struct super_block *sb) destroy_device_list(sbi); mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); @@ -664,6 +860,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -698,6 +897,48 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -733,9 +974,49 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -809,11 +1090,16 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) sbi->fault_info.inject_rate); #endif #ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); if (test_opt(sbi, USRQUOTA)) seq_puts(seq, ",usrquota"); if (test_opt(sbi, GRPQUOTA)) seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); #endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } @@ -862,6 +1148,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we @@ -871,6 +1162,23 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -952,6 +1260,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); @@ -966,6 +1279,13 @@ restore_gc: stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; @@ -1065,7 +1385,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, } if (len == towrite) - return err; + return 0; inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -1082,6 +1402,27 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) return &F2FS_I(inode)->i_reserved_quota; } +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + static int f2fs_quota_sync(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); @@ -1119,7 +1460,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, struct inode *inode; int err; - err = f2fs_quota_sync(sb, -1); + err = f2fs_quota_sync(sb, type); if (err) return err; @@ -1147,7 +1488,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - f2fs_quota_sync(sb, -1); + f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); if (err) @@ -1163,7 +1504,7 @@ out_put: return err; } -static void f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { int type; @@ -1171,6 +1512,12 @@ static void f2fs_quota_off_umount(struct super_block *sb) f2fs_quota_off(sb, type); } +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} + static const struct dquot_operations f2fs_quota_operations = { .get_reserved_space = f2fs_get_reserved_space, .write_dquot = dquot_commit, @@ -1180,6 +1527,7 @@ static const struct dquot_operations f2fs_quota_operations = { .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = f2fs_get_projid, .get_next_id = dquot_get_next_id, }; @@ -1194,12 +1542,12 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .get_nextdqblk = dquot_get_next_dqblk, }; #else -static inline void f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { } #endif -static struct super_operations f2fs_sops = { +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, @@ -1303,9 +1651,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. + */ + /* two direct node blocks */ result += (leaf_count * 2); @@ -1922,6 +2277,11 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware @@ -1956,7 +2316,7 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; sb->s_qcop = &f2fs_quotactl_ops; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif sb->s_op = &f2fs_sops; @@ -1980,6 +2340,10 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1: NR_TEMP_TYPE; int j; @@ -2098,11 +2462,6 @@ try_onemore: if (err) goto free_nm; - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); - if (err) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { @@ -2122,10 +2481,15 @@ try_onemore: goto free_root_inode; } - err = f2fs_init_sysfs(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; + /* if there are nt orphan nodes free them */ + err = recover_orphan_inodes(sbi); + if (err) + goto free_sysfs; + /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { /* @@ -2135,7 +2499,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_sysfs; + goto free_meta; } if (need_fsck) @@ -2149,7 +2513,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_sysfs; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -2173,7 +2537,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_sysfs; + goto free_meta; } kfree(options); @@ -2191,9 +2555,17 @@ skip_recovery: f2fs_update_time(sbi, REQ_TIME); return 0; -free_sysfs: +free_meta: f2fs_sync_inode_meta(sbi); - f2fs_exit_sysfs(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2202,13 +2574,6 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); - /* - * Some dirty meta pages can be produced by recover_orphan_inodes() - * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). - */ - truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); f2fs_destroy_stats(sbi); @@ -2228,6 +2593,10 @@ free_options: for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); @@ -2311,7 +2680,7 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - err = f2fs_register_sysfs(); + err = f2fs_init_sysfs(); if (err) goto free_extent_cache; err = register_shrinker(&f2fs_shrinker_info); @@ -2330,7 +2699,7 @@ free_filesystem: free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_sysfs: - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2350,7 +2719,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 71191d89917d..e2c258f717cd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -18,7 +18,6 @@ #include "gc.h" static struct proc_dir_entry *f2fs_proc_root; -static struct kset *f2fs_kset; /* Sysfs support for f2fs */ enum { @@ -41,6 +40,7 @@ struct f2fs_attr { const char *, size_t); int struct_type; int offset; + int id; }; static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) @@ -76,6 +76,34 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, BD_PART_WRITTEN(sbi))); } +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -124,7 +152,39 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, spin_unlock(&sbi->stat_lock); return count; } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + + *ui = t; + return count; + } + *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + return count; } @@ -155,6 +215,30 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -172,12 +256,23 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); @@ -191,20 +286,36 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), ATTR_LIST(gc_min_sleep_time), ATTR_LIST(gc_max_sleep_time), ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), @@ -217,26 +328,59 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), ATTR_LIST(reserved_blocks), NULL, }; +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, }; -static struct kobj_type f2fs_ktype = { +static struct kobj_type f2fs_sb_ktype = { .default_attrs = f2fs_attrs, .sysfs_ops = &f2fs_attr_ops, .release = f2fs_sb_release, }; +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + static int segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; @@ -288,6 +432,48 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) return 0; } +static int iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + #define F2FS_PROC_FILE_DEF(_name) \ static int _name##_open_fs(struct inode *inode, struct file *file) \ { \ @@ -303,28 +489,47 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); -int __init f2fs_register_sysfs(void) +int __init f2fs_init_sysfs(void) { - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + int ret; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) - return -ENOMEM; - return 0; + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; } -void f2fs_unregister_sysfs(void) +void f2fs_exit_sysfs(void) { - kset_unregister(f2fs_kset); + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; } -int f2fs_init_sysfs(struct f2fs_sb_info *sbi) +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) { struct super_block *sb = sbi->sb; int err; + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -333,33 +538,19 @@ int f2fs_init_sysfs(struct f2fs_sb_info *sbi) &f2fs_seq_segment_info_fops, sb); proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); - if (err) - goto err_out; return 0; -err_out: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - return err; } -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + kobject_del(&sbi->s_kobj); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 832c5110abab..7c65540148f8 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -442,7 +442,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); @@ -473,8 +473,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; + down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -503,7 +505,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -686,7 +690,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 1d9a8c4e9de0..48b2336692f9 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -309,7 +309,7 @@ static void mark_fsinfo_dirty(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); - if (sb->s_flags & MS_RDONLY || sbi->fat_bits != 32) + if (sb_rdonly(sb) || sbi->fat_bits != 32) return; __mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a2c05f2ada6d..30c52394a7ad 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -657,7 +657,7 @@ static void fat_set_state(struct super_block *sb, struct msdos_sb_info *sbi = MSDOS_SB(sb); /* do not change any thing if mounted read only */ - if ((sb->s_flags & MS_RDONLY) && !force) + if (sb_rdonly(sb) && !force) return; /* do not change state if fs was dirty */ @@ -787,7 +787,7 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) /* make sure we update state on remount. */ new_rdonly = *flags & MS_RDONLY; - if (new_rdonly != (sb->s_flags & MS_RDONLY)) { + if (new_rdonly != sb_rdonly(sb)) { if (new_rdonly) fat_set_state(sb, 0, 0); else diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 8a8698119ff7..acc3aa30ee54 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -32,7 +32,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) if (opts->errors == FAT_ERRORS_PANIC) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); - else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { + else if (opts->errors == FAT_ERRORS_RO && !sb_rdonly(sb)) { sb->s_flags |= MS_RDONLY; fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } diff --git a/fs/file_table.c b/fs/file_table.c index 72e861a35a7f..61517f57f8ef 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -233,12 +233,10 @@ static LLIST_HEAD(delayed_fput_list); static void delayed_fput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_fput_list); - struct llist_node *next; + struct file *f, *t; - for (; node; node = next) { - next = llist_next(node); - __fput(llist_entry(node, struct file, f_u.fu_llist)); - } + llist_for_each_entry_safe(f, t, node, f_u.fu_llist) + __fput(f); } static void ____fput(struct callback_head *work) @@ -312,7 +310,7 @@ void put_filp(struct file *file) } void __init files_init(void) -{ +{ filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); @@ -331,4 +329,4 @@ void __init files_maxfiles_init(void) n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); -} +} diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 67f940892ef8..b5ab06fabc60 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -262,7 +262,8 @@ static int fscache_objlist_show(struct seq_file *m, void *v) type = "DT"; break; default: - sprintf(_type, "%02u", cookie->def->type); + snprintf(_type, sizeof(_type), "%02u", + cookie->def->type); type = _type; break; } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index c5b6b7165489..e9e97803442a 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt) static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb); loff_t pos = 0; return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); @@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb); loff_t pos = 0; /* * No locking or generic_write_checks(), the server is diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c16d00e53264..13c65dd2d37d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1222,9 +1222,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_in *in; unsigned reqsize; - if (task_active_pid_ns(current) != fc->pid_ns) - return -EIO; - restart: spin_lock(&fiq->waitq.lock); err = -EAGAIN; @@ -1262,6 +1259,13 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, in = &req->in; reqsize = in->h.len; + + if (task_active_pid_ns(current) != fc->pid_ns) { + rcu_read_lock(); + in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns)); + rcu_read_unlock(); + } + /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; @@ -1823,9 +1827,6 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_req *req; struct fuse_out_header oh; - if (task_active_pid_ns(current) != fc->pid_ns) - return -EIO; - if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 00800c07ba1c..622081b97426 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -923,33 +923,29 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, return err; } -int fuse_update_attributes(struct inode *inode, struct kstat *stat, - struct file *file, bool *refreshed) +static int fuse_update_get_attr(struct inode *inode, struct file *file, + struct kstat *stat) { struct fuse_inode *fi = get_fuse_inode(inode); - int err; - bool r; + int err = 0; if (time_before64(fi->i_time, get_jiffies_64())) { - r = true; forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); - } else { - r = false; - err = 0; - if (stat) { - generic_fillattr(inode, stat); - stat->mode = fi->orig_i_mode; - stat->ino = fi->orig_ino; - } + } else if (stat) { + generic_fillattr(inode, stat); + stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; } - if (refreshed != NULL) - *refreshed = r; - return err; } +int fuse_update_attributes(struct inode *inode, struct file *file) +{ + return fuse_update_get_attr(inode, file, NULL); +} + int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, u64 child_nodeid, struct qstr *name) { @@ -1786,7 +1782,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_attributes(inode, stat, NULL, NULL); + return fuse_update_get_attr(inode, NULL, stat); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d66789804287..cb7dff5c45d7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -645,7 +645,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { - struct file *file = io->file; + struct file *file = io->iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -707,7 +707,8 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, static int fuse_do_readpage(struct file *file, struct page *page) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); + struct kiocb iocb; + struct fuse_io_priv io; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; @@ -735,6 +736,8 @@ static int fuse_do_readpage(struct file *file, struct page *page) req->num_pages = 1; req->pages[0] = page; req->page_descs[0].length = count; + init_sync_kiocb(&iocb, file); + io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb); num_read = fuse_send_read(req, &io, pos, count, NULL); err = req->out.h.error; @@ -923,7 +926,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (fc->auto_inval_data || (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; - err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); + err = fuse_update_attributes(inode, iocb->ki_filp); if (err) return err; } @@ -957,13 +960,18 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { - struct file *file = io->file; + struct kiocb *iocb = io->iocb; + struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; struct fuse_write_in *inarg = &req->misc.write.in; fuse_write_fill(req, ff, pos, count); inarg->flags = file->f_flags; + if (iocb->ki_flags & IOCB_DSYNC) + inarg->flags |= O_DSYNC; + if (iocb->ki_flags & IOCB_SYNC) + inarg->flags |= O_SYNC; if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); @@ -993,14 +1001,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) return ret; } -static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, +static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, struct inode *inode, loff_t pos, size_t count) { size_t res; unsigned offset; unsigned i; - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); for (i = 0; i < req->num_pages; i++) fuse_wait_on_page_writeback(inode, req->pages[i]->index); @@ -1100,7 +1108,7 @@ static inline unsigned fuse_wr_pages(loff_t pos, size_t len) FUSE_MAX_PAGES_PER_REQ); } -static ssize_t fuse_perform_write(struct file *file, +static ssize_t fuse_perform_write(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *ii, loff_t pos) { @@ -1133,7 +1141,7 @@ static ssize_t fuse_perform_write(struct file *file, } else { size_t num_written; - num_written = fuse_send_write_pages(req, file, inode, + num_written = fuse_send_write_pages(req, iocb, inode, pos, count); err = req->out.h.error; if (!err) { @@ -1169,7 +1177,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (get_fuse_conn(inode)->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ - err = fuse_update_attributes(mapping->host, NULL, file, NULL); + err = fuse_update_attributes(mapping->host, file); if (err) return err; @@ -1201,7 +1209,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) pos += written; - written_buffered = fuse_perform_write(file, mapping, from, pos); + written_buffered = fuse_perform_write(iocb, mapping, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1220,13 +1228,15 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) written += written_buffered; iocb->ki_pos = pos + written_buffered; } else { - written = fuse_perform_write(file, mapping, from, iocb->ki_pos); + written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); if (written >= 0) iocb->ki_pos += written; } out: current->backing_dev_info = NULL; inode_unlock(inode); + if (written > 0) + written = generic_write_sync(iocb, written); return written ? written : err; } @@ -1317,7 +1327,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, { int write = flags & FUSE_DIO_WRITE; int cuse = flags & FUSE_DIO_CUSE; - struct file *file = io->file; + struct file *file = io->iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -1399,8 +1409,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, loff_t *ppos) { ssize_t res; - struct file *file = io->file; - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(io->iocb->ki_filp); if (is_bad_inode(inode)) return -EIO; @@ -1414,15 +1423,14 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); return __fuse_direct_read(&io, to, &iocb->ki_pos); } static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; if (is_bad_inode(inode)) @@ -2181,9 +2189,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; - if (pid && pid_nr == 0) - return -EOVERFLOW; - fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fc, &args); @@ -2303,7 +2308,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); fallback: - err = fuse_update_attributes(inode, NULL, file, NULL); + err = fuse_update_attributes(inode, file); if (!err) return generic_file_llseek(file, offset, whence); else @@ -2323,7 +2328,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) break; case SEEK_END: inode_lock(inode); - retval = fuse_update_attributes(inode, NULL, file, NULL); + retval = fuse_update_attributes(inode, file); if (!retval) retval = generic_file_llseek(file, offset, whence); inode_unlock(inode); @@ -2874,7 +2879,6 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) io->offset = offset; io->write = (iov_iter_rw(iter) == WRITE); io->err = 0; - io->file = file; /* * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bd4d2a3e1ec1..d5773ca67ad2 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -252,16 +252,15 @@ struct fuse_io_priv { bool should_dirty; int err; struct kiocb *iocb; - struct file *file; struct completion *done; bool blocking; }; -#define FUSE_IO_PRIV_SYNC(f) \ +#define FUSE_IO_PRIV_SYNC(i) \ { \ .refcnt = KREF_INIT(1), \ .async = 0, \ - .file = f, \ + .iocb = i, \ } /** @@ -905,8 +904,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); void fuse_update_ctime(struct inode *inode); -int fuse_update_attributes(struct inode *inode, struct kstat *stat, - struct file *file, bool *refreshed); +int fuse_update_attributes(struct inode *inode, struct file *file); void fuse_flush_writepages(struct inode *inode); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index dac6559e2195..cdd1c5f06f45 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -554,7 +554,7 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) struct gfs2_inode *ip = gl->gl_object; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) + if (!remote || sb_rdonly(sdp->sd_vfs)) return; if (gl->gl_demote_state == LM_ST_UNLOCKED && diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 84593587691d..a3711f543405 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1037,7 +1037,7 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp) char ro[20]; char spectator[20]; char *envp[] = { ro, spectator, NULL }; - sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(ro, "RDONLY=%d", sb_rdonly(sb)); sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); } @@ -1179,7 +1179,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_per_node; } - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { error = gfs2_make_fs_rw(sdp); if (error) { fs_err(sdp, "can't make FS RW: %d\n", error); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e647938432bd..e700fb162664 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -452,7 +452,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) *qdp = NULL; - if (sdp->sd_vfs->s_flags & MS_RDONLY) + if (sb_rdonly(sdp->sd_vfs)) return 0; spin_lock(&qd_lock); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 113b6095a58d..9395a3db1a60 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -522,7 +522,7 @@ void gfs2_recover_func(struct work_struct *work) if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { - if (sdp->sd_vfs->s_flags & MS_RDONLY) { + if (sb_rdonly(sdp->sd_vfs)) { /* check if device itself is read-only */ ro = bdev_read_only(sdp->sd_vfs->s_bdev); if (!ro) { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 769841185ce5..8e54f2e3a304 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -893,7 +893,7 @@ restart: } spin_unlock(&sdp->sd_jindex_spin); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { error = gfs2_make_fs_ro(sdp); if (error) gfs2_io_error(sdp); @@ -1569,7 +1569,7 @@ static void gfs2_evict_inode(struct inode *inode) return; } - if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) + if (inode->i_nlink || sb_rdonly(sb)) goto out; if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index ca1f97ff898c..9eb9d0a1abd9 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -645,7 +645,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) char *envp[] = { ro, spectator, NULL }; int sysfs_frees_sdp = 0; - sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(ro, "RDONLY=%d", sb_rdonly(sb)); sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); sdp->sd_kobj.kset = gfs2_kset; diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 482081bcdf70..894994d2c885 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -210,7 +210,7 @@ int hfs_mdb_get(struct super_block *sb) pr_warn("filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { /* Mark the volume uncleanly unmounted in case we crash */ attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); @@ -259,7 +259,7 @@ void hfs_mdb_commit(struct super_block *sb) { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; lock_buffer(HFS_SB(sb)->mdb_bh); @@ -334,7 +334,7 @@ void hfs_mdb_commit(struct super_block *sb) void hfs_mdb_close(struct super_block *sb) { /* update volume attributes */ - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; HFS_SB(sb)->mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index bf6304a350a6..7e0d65e9586c 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -71,7 +71,7 @@ void hfs_mark_mdb_dirty(struct super_block *sb) struct hfs_sb_info *sbi = HFS_SB(sb); unsigned long delay; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; spin_lock(&sbi->work_lock); @@ -115,7 +115,7 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); *flags |= MS_NODIRATIME; - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) return 0; if (!(*flags & MS_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 67aedf4c2e7c..e5bb2de2262a 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -264,7 +264,7 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb) struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); unsigned long delay; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; spin_lock(&sbi->work_lock); @@ -284,7 +284,7 @@ static void hfsplus_put_super(struct super_block *sb) cancel_delayed_work_sync(&sbi->sync_work); - if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { + if (!sb_rdonly(sb) && sbi->s_vhdr) { struct hfsplus_vh *vhdr = sbi->s_vhdr; vhdr->modify_date = hfsp_now2mt(); @@ -329,7 +329,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfsplus_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) return 0; if (!(*flags & MS_RDONLY)) { struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; @@ -462,7 +462,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) pr_warn("Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && - !(sb->s_flags & MS_RDONLY)) { + !sb_rdonly(sb)) { pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } @@ -535,7 +535,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) } else hfs_find_exit(&fd); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { /* * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused * all three are registered with Apple for our use diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index d6a4b55d2ab0..098bf0f4f386 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -538,7 +538,7 @@ int hpfs_trim_fs(struct super_block *s, u64 start, u64 end, u64 minlen, unsigned return 0; if (start < sbi->sb_dirband_start + sbi->sb_dirband_size && end > sbi->sb_dirband_start) { hpfs_lock(s); - if (s->s_flags & MS_RDONLY) { + if (sb_rdonly(s)) { err = -EROFS; goto unlock_1; } @@ -559,7 +559,7 @@ unlock_1: end_bmp = (end + 0x3fff) >> 14; while (start_bmp < end_bmp && !err) { hpfs_lock(s); - if (s->s_flags & MS_RDONLY) { + if (sb_rdonly(s)) { err = -EROFS; goto unlock_2; } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 7b9150c2e75c..fa6bbb4f509f 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -264,7 +264,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in hpfs_result = hpfs_i(result); if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; - if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { + if (de->has_acl || de->has_xtd_perm) if (!sb_rdonly(dir->i_sb)) { hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); goto bail1; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 82067ca22f2b..1516fb4e28f4 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -21,7 +21,7 @@ static void mark_dirty(struct super_block *s, int remount) { - if (hpfs_sb(s)->sb_chkdsk && (remount || !(s->s_flags & MS_RDONLY))) { + if (hpfs_sb(s)->sb_chkdsk && (remount || !sb_rdonly(s))) { struct buffer_head *bh; struct hpfs_spare_block *sb; if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { @@ -41,7 +41,7 @@ static void unmark_dirty(struct super_block *s) { struct buffer_head *bh; struct hpfs_spare_block *sb; - if (s->s_flags & MS_RDONLY) return; + if (sb_rdonly(s)) return; sync_blockdev(s->s_bdev); if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error; @@ -73,14 +73,14 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) mark_dirty(s, 0); panic("HPFS panic"); } else if (hpfs_sb(s)->sb_err == 1) { - if (s->s_flags & MS_RDONLY) + if (sb_rdonly(s)) pr_cont("; already mounted read-only\n"); else { pr_cont("; remounting read-only\n"); mark_dirty(s, 0); s->s_flags |= MS_RDONLY; } - } else if (s->s_flags & MS_RDONLY) + } else if (sb_rdonly(s)) pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); else pr_cont("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); @@ -607,8 +607,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) } /* Check version */ - if (!(s->s_flags & MS_RDONLY) && - superblock->funcversion != 2 && superblock->funcversion != 3) { + if (!sb_rdonly(s) && superblock->funcversion != 2 && superblock->funcversion != 3) { pr_err("Bad version %d,%d. Mount readonly to go around\n", (int)superblock->version, (int)superblock->funcversion); pr_err("please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); @@ -666,7 +665,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) hpfs_error(s, "improperly stopped"); } - if (!(s->s_flags & MS_RDONLY)) { + if (!sb_rdonly(s)) { spareblock->dirty = 1; spareblock->old_wrote = 0; mark_buffer_dirty(bh2); diff --git a/fs/inode.c b/fs/inode.c index 210054157a49..d1e35b53bb23 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1570,11 +1570,24 @@ EXPORT_SYMBOL(bmap); static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, bool rcu) { - if (!rcu) { - struct inode *realinode = d_real_inode(dentry); + struct dentry *upperdentry; - if (unlikely(inode != realinode) && - (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || + /* + * Nothing to do if in rcu or if non-overlayfs + */ + if (rcu || likely(!(dentry->d_flags & DCACHE_OP_REAL))) + return; + + upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); + + /* + * If file is on lower then we can't update atime, so no worries about + * stale mtime/ctime. + */ + if (upperdentry) { + struct inode *realinode = d_inode(upperdentry); + + if ((!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { inode->i_mtime = realinode->i_mtime; inode->i_ctime = realinode->i_ctime; diff --git a/fs/internal.h b/fs/internal.h index fedfe94d84ba..48cee21b4f14 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -71,8 +71,10 @@ extern void __init mnt_init(void); extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); +extern int mnt_want_write_file_path(struct file *); extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); +extern void mnt_drop_write_file_path(struct file *); /* * fs_struct.c diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index f1ed935322db..db692f554158 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -737,7 +737,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) root_found: /* We don't support read-write mounts */ - if (!(s->s_flags & MS_RDONLY)) { + if (!sb_rdonly(s)) { error = -EACCES; goto out_freebh; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 76fa814df3d1..e96c6b05e43e 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -395,14 +395,14 @@ int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - if (c->flags & JFFS2_SB_FLAG_RO && !(sb->s_flags & MS_RDONLY)) + if (c->flags & JFFS2_SB_FLAG_RO && !sb_rdonly(sb)) return -EROFS; /* We stop if it was running, then restart if it needs to. This also catches the case where it was stopped and this is just a remount to restart it. Flush the writebuffer, if neccecary, else we loose it */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { jffs2_stop_garbage_collect_thread(c); mutex_lock(&c->alloc_sem); jffs2_flush_wbuf_pad(c); @@ -590,7 +590,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = JFFS2_SUPER_MAGIC; - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) jffs2_start_garbage_collect_thread(c); return 0; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 5ef21f4c4c77..153f1c6eb169 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -342,7 +342,7 @@ static void jffs2_put_super (struct super_block *sb) static void jffs2_kill_sb(struct super_block *sb) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) jffs2_stop_garbage_collect_thread(c); kill_mtd_super(sb); kfree(c); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 48d9522e209c..2cfe487708e0 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1162,7 +1162,7 @@ static void delayed_wbuf_sync(struct work_struct *work) struct jffs2_sb_info *c = work_to_sb(work); struct super_block *sb = OFNI_BS_2SFFJ(c); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { jffs2_dbg(1, "%s()\n", __func__); jffs2_flush_wbuf_gc(c, 0); } @@ -1173,7 +1173,7 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c) struct super_block *sb = OFNI_BS_2SFFJ(c); unsigned long delay; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; delay = msecs_to_jiffies(dirty_writeback_interval * 10); diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 9895595fd2f2..d8658607bf46 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -362,7 +362,7 @@ static int chkSuper(struct super_block *sb) /* validate fs state */ if (j_sb->s_state != cpu_to_le32(FM_CLEAN) && - !(sb->s_flags & MS_RDONLY)) { + !sb_rdonly(sb)) { jfs_err("jfs_mount: Mount Failure: File System Dirty."); rc = -EINVAL; goto out; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 60726ae7cf26..2f14677169c3 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -76,7 +76,7 @@ static void jfs_handle_error(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; updateSuper(sb, FM_DIRTY); @@ -468,7 +468,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; if (newLVSize) { - if (sb->s_flags & MS_RDONLY) { + if (sb_rdonly(sb)) { pr_err("JFS: resize requires volume to be mounted read-write\n"); return -EROFS; } @@ -477,7 +477,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) return rc; } - if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { /* * Invalidate any previously read metadata. fsck may have * changed the on-disk data since we mounted r/o @@ -493,7 +493,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) dquot_resume(sb, -1); return ret; } - if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { + if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; @@ -502,7 +502,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) return rc; } if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { rc = jfs_umount_rw(sb); if (rc) return rc; @@ -592,7 +592,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) jfs_err("jfs_mount failed w/return code = %d", rc); goto out_mount_failed; } - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) sbi->log = NULL; else { rc = jfs_mount_rw(sb, 0); @@ -652,7 +652,7 @@ static int jfs_freeze(struct super_block *sb) struct jfs_log *log = sbi->log; int rc = 0; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { txQuiesce(sb); rc = lmLogShutdown(log); if (rc) { @@ -682,7 +682,7 @@ static int jfs_unfreeze(struct super_block *sb) struct jfs_log *log = sbi->log; int rc = 0; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { rc = updateSuper(sb, FM_MOUNT); if (rc) { jfs_error(sb, "updateSuper failed\n"); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 6ac76b0434e9..b6829d679643 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -42,7 +42,7 @@ static void minix_put_super(struct super_block *sb) int i; struct minix_sb_info *sbi = minix_sb(sb); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ sbi->s_ms->s_state = sbi->s_mount_state; mark_buffer_dirty(sbi->s_sbh); @@ -125,7 +125,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) sync_filesystem(sb); ms = sbi->s_ms; - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) return 0; if (*flags & MS_RDONLY) { if (ms->s_state & MINIX_VALID_FS || @@ -293,7 +293,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) if (!s->s_root) goto out_no_root; - if (!(s->s_flags & MS_RDONLY)) { + if (!sb_rdonly(s)) { if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ ms->s_state &= ~MINIX_VALID_FS; mark_buffer_dirty(bh); diff --git a/fs/namei.c b/fs/namei.c index 1180f9c58093..c75ea03ca147 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -447,8 +447,7 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ - if ((sb->s_flags & MS_RDONLY) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; diff --git a/fs/namespace.c b/fs/namespace.c index f8893dc6a989..54059b142d6b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -275,7 +275,7 @@ int __mnt_is_readonly(struct vfsmount *mnt) { if (mnt->mnt_flags & MNT_READONLY) return 1; - if (mnt->mnt_sb->s_flags & MS_RDONLY) + if (sb_rdonly(mnt->mnt_sb)) return 1; return 0; } @@ -431,13 +431,18 @@ int __mnt_want_write_file(struct file *file) } /** - * mnt_want_write_file - get write access to a file's mount + * mnt_want_write_file_path - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already + * + * Called by the vfs for cases when we have an open file at hand, but will do an + * inode operation on it (important distinction for files opened on overlayfs, + * since the file operations will come from the real underlying file, while + * inode operations come from the overlay). */ -int mnt_want_write_file(struct file *file) +int mnt_want_write_file_path(struct file *file) { int ret; @@ -447,6 +452,53 @@ int mnt_want_write_file(struct file *file) sb_end_write(file->f_path.mnt->mnt_sb); return ret; } + +static inline int may_write_real(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct dentry *upperdentry; + + /* Writable file? */ + if (file->f_mode & FMODE_WRITER) + return 0; + + /* Not overlayfs? */ + if (likely(!(dentry->d_flags & DCACHE_OP_REAL))) + return 0; + + /* File refers to upper, writable layer? */ + upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); + if (upperdentry && file_inode(file) == d_inode(upperdentry)) + return 0; + + /* Lower layer: can't write to real file, sorry... */ + return -EPERM; +} + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + * + * Mostly called by filesystems from their ioctl operation before performing + * modification. On overlayfs this needs to check if the file is on a read-only + * lower layer and deny access in that case. + */ +int mnt_want_write_file(struct file *file) +{ + int ret; + + ret = may_write_real(file); + if (!ret) { + sb_start_write(file_inode(file)->i_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file_inode(file)->i_sb); + } + return ret; +} EXPORT_SYMBOL_GPL(mnt_want_write_file); /** @@ -484,10 +536,16 @@ void __mnt_drop_write_file(struct file *file) __mnt_drop_write(file->f_path.mnt); } -void mnt_drop_write_file(struct file *file) +void mnt_drop_write_file_path(struct file *file) { mnt_drop_write(file->f_path.mnt); } + +void mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); + sb_end_write(file_inode(file)->i_sb); +} EXPORT_SYMBOL(mnt_drop_write_file); static int mnt_make_readonly(struct mount *mnt) @@ -971,7 +1029,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (!mnt) return ERR_PTR(-ENOMEM); - if (flags & MS_KERNMOUNT) + if (flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; root = mount_fs(type, flags, name, data); @@ -1003,7 +1061,7 @@ vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, if (mountpoint->d_sb->s_user_ns != &init_user_ns) return ERR_PTR(-EPERM); - return vfs_kern_mount(type, MS_SUBMOUNT, name, data); + return vfs_kern_mount(type, SB_SUBMOUNT, name, data); } EXPORT_SYMBOL_GPL(vfs_submount); @@ -1124,12 +1182,10 @@ static LLIST_HEAD(delayed_mntput_list); static void delayed_mntput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_mntput_list); - struct llist_node *next; + struct mount *m, *t; - for (; node; node = next) { - next = llist_next(node); - cleanup_mnt(llist_entry(node, struct mount, mnt_llist)); - } + llist_for_each_entry_safe(m, t, node, mnt_llist) + cleanup_mnt(m); } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); @@ -1534,8 +1590,8 @@ static int do_umount(struct mount *mnt, int flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; down_write(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY)) - retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); + if (!sb_rdonly(sb)) + retval = do_remount_sb(sb, SB_RDONLY, NULL, 0); up_write(&sb->s_umount); return retval; } @@ -2059,7 +2115,7 @@ static void unlock_mount(struct mountpoint *where) static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { - if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) + if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; if (d_is_dir(mp->m_dentry) != @@ -2073,9 +2129,9 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) * Sanity check the flags to change_mnt_propagation. */ -static int flags_to_propagation_type(int flags) +static int flags_to_propagation_type(int ms_flags) { - int type = flags & ~(MS_REC | MS_SILENT); + int type = ms_flags & ~(MS_REC | MS_SILENT); /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) @@ -2089,18 +2145,18 @@ static int flags_to_propagation_type(int flags) /* * recursively change the type of the mountpoint. */ -static int do_change_type(struct path *path, int flag) +static int do_change_type(struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); - int recurse = flag & MS_REC; + int recurse = ms_flags & MS_REC; int type; int err = 0; if (path->dentry != path->mnt->mnt_root) return -EINVAL; - type = flags_to_propagation_type(flag); + type = flags_to_propagation_type(ms_flags); if (!type) return -EINVAL; @@ -2222,8 +2278,8 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ -static int do_remount(struct path *path, int flags, int mnt_flags, - void *data) +static int do_remount(struct path *path, int ms_flags, int sb_flags, + int mnt_flags, void *data) { int err; struct super_block *sb = path->mnt->mnt_sb; @@ -2267,12 +2323,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags, return err; down_write(&sb->s_umount); - if (flags & MS_BIND) - err = change_mount_flags(path->mnt, flags); + if (ms_flags & MS_BIND) + err = change_mount_flags(path->mnt, ms_flags); else if (!capable(CAP_SYS_ADMIN)) err = -EPERM; else - err = do_remount_sb(sb, flags, data, 0); + err = do_remount_sb(sb, sb_flags, data, 0); if (!err) { lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; @@ -2437,7 +2493,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *fstype, int flags, +static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; @@ -2451,7 +2507,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, if (!type) return -ENODEV; - mnt = vfs_kern_mount(type, flags, name, data); + mnt = vfs_kern_mount(type, sb_flags, name, data); if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && !mnt->mnt_sb->s_subtype) mnt = fs_set_subtype(mnt, fstype); @@ -2706,8 +2762,8 @@ long do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; + unsigned int mnt_flags = 0, sb_flags; int retval = 0; - int mnt_flags = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) @@ -2717,6 +2773,9 @@ long do_mount(const char *dev_name, const char __user *dir_name, if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; + if (flags & MS_NOUSER) + return -EINVAL; + /* ... and get the mountpoint */ retval = user_path(dir_name, &path); if (retval) @@ -2726,7 +2785,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, type_page, flags, data_page); if (!retval && !may_mount()) retval = -EPERM; - if (!retval && (flags & MS_MANDLOCK) && !may_mandlock()) + if (!retval && (flags & SB_MANDLOCK) && !may_mandlock()) retval = -EPERM; if (retval) goto dput_out; @@ -2748,7 +2807,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); - if (flags & MS_RDONLY) + if (flags & SB_RDONLY) mnt_flags |= MNT_READONLY; /* The default atime for remount is preservation */ @@ -2759,12 +2818,15 @@ long do_mount(const char *dev_name, const char __user *dir_name, mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; } - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT); + sb_flags = flags & (SB_RDONLY | + SB_SYNCHRONOUS | + SB_MANDLOCK | + SB_DIRSYNC | + SB_SILENT | + SB_POSIXACL); if (flags & MS_REMOUNT) - retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, + retval = do_remount(&path, flags, sb_flags, mnt_flags, data_page); else if (flags & MS_BIND) retval = do_loopback(&path, dev_name, flags & MS_REC); @@ -2773,7 +2835,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, else if (flags & MS_MOVE) retval = do_move_mount(&path, dev_name); else - retval = do_new_mount(&path, type_page, flags, mnt_flags, + retval = do_new_mount(&path, type_page, sb_flags, mnt_flags, dev_name, data_page); dput_out: path_put(&path); @@ -3223,7 +3285,7 @@ void put_mnt_ns(struct mnt_namespace *ns) struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) { struct vfsmount *mnt; - mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until @@ -3300,7 +3362,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, mnt_flags = mnt->mnt.mnt_flags; /* Don't miss readonly hidden in the superblock flags */ - if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY) + if (sb_rdonly(mnt->mnt.mnt_sb)) mnt_flags |= MNT_LOCK_READONLY; /* Verify the mount flags are equal to or more permissive diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a385d1c3f146..0214dd1e1060 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -208,21 +208,19 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap); * fall back to doing a synchronous write. */ static int -nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) +nfs_file_fsync_commit(struct file *file, int datasync) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = file_inode(file); - int have_error, do_resend, status; + int do_resend, status; int ret = 0; dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); - have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); - have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - if (have_error) { + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) { ret = xchg(&ctx->error, 0); if (ret) goto out; @@ -247,10 +245,16 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); do { + struct nfs_open_context *ctx = nfs_file_open_context(file); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) { + int ret2 = xchg(&ctx->error, 0); + if (ret2) + ret = ret2; + } if (ret != 0) break; - ret = nfs_file_fsync_commit(file, start, end, datasync); + ret = nfs_file_fsync_commit(file, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 68cc22083639..5bdf952f414b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -768,3 +768,10 @@ static inline bool nfs_error_is_fatal(int err) return false; } } + +static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) +{ + ctx->error = error; + smp_wmb(); + set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); +} diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 2ca9167bc97d..551711042ba4 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -719,6 +719,254 @@ TRACE_EVENT(nfs_sillyrename_unlink, __get_str(name) ) ); + +TRACE_EVENT(nfs_initiate_read, + TP_PROTO( + const struct inode *inode, + loff_t offset, unsigned long count + ), + + TP_ARGS(inode, offset, count), + + TP_STRUCT__entry( + __field(loff_t, offset) + __field(unsigned long, count) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->offset = offset; + __entry->count = count; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->count + ) +); + +TRACE_EVENT(nfs_readpage_done, + TP_PROTO( + const struct inode *inode, + int status, loff_t offset, bool eof + ), + + TP_ARGS(inode, status, offset, eof), + + TP_STRUCT__entry( + __field(int, status) + __field(loff_t, offset) + __field(bool, eof) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->status = status; + __entry->offset = offset; + __entry->eof = eof; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld status=%d%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->status, + __entry->eof ? " eof" : "" + ) +); + +/* + * XXX: I tried using NFS_UNSTABLE and friends in this table, but they + * all evaluate to 0 for some reason, even if I include linux/nfs.h. + */ +#define nfs_show_stable(stable) \ + __print_symbolic(stable, \ + { 0, " (UNSTABLE)" }, \ + { 1, " (DATA_SYNC)" }, \ + { 2, " (FILE_SYNC)" }) + +TRACE_EVENT(nfs_initiate_write, + TP_PROTO( + const struct inode *inode, + loff_t offset, unsigned long count, + enum nfs3_stable_how stable + ), + + TP_ARGS(inode, offset, count, stable), + + TP_STRUCT__entry( + __field(loff_t, offset) + __field(unsigned long, count) + __field(enum nfs3_stable_how, stable) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->offset = offset; + __entry->count = count; + __entry->stable = stable; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%lu stable=%d%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->count, + __entry->stable, nfs_show_stable(__entry->stable) + ) +); + +TRACE_EVENT(nfs_writeback_done, + TP_PROTO( + const struct inode *inode, + int status, + loff_t offset, + struct nfs_writeverf *writeverf + ), + + TP_ARGS(inode, status, offset, writeverf), + + TP_STRUCT__entry( + __field(int, status) + __field(loff_t, offset) + __field(enum nfs3_stable_how, stable) + __field(unsigned long long, verifier) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->status = status; + __entry->offset = offset; + __entry->stable = writeverf->committed; + memcpy(&__entry->verifier, &writeverf->verifier, + sizeof(__entry->verifier)); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld status=%d stable=%d%s " + "verifier 0x%016llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->status, + __entry->stable, nfs_show_stable(__entry->stable), + __entry->verifier + ) +); + +TRACE_EVENT(nfs_initiate_commit, + TP_PROTO( + const struct nfs_commit_data *data + ), + + TP_ARGS(data), + + TP_STRUCT__entry( + __field(loff_t, offset) + __field(unsigned long, count) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->count + ) +); + +TRACE_EVENT(nfs_commit_done, + TP_PROTO( + const struct nfs_commit_data *data + ), + + TP_ARGS(data), + + TP_STRUCT__entry( + __field(int, status) + __field(loff_t, offset) + __field(unsigned long long, verifier) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->status = data->res.op_status; + __entry->offset = data->args.offset; + memcpy(&__entry->verifier, &data->verf.verifier, + sizeof(__entry->verifier)); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld status=%d verifier 0x%016llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->status, + __entry->verifier + ) +); + #endif /* _TRACE_NFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index bec120ec1967..d0543e19098a 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1170,8 +1170,8 @@ out_failed: /* remember fatal errors */ if (nfs_error_is_fatal(desc->pg_error)) - mapping_set_error(desc->pg_inode->i_mapping, - desc->pg_error); + nfs_context_set_write_error(req->wb_context, + desc->pg_error); func = desc->pg_completion_ops->error_cleanup; for (midx = 0; midx < desc->pg_mirror_count; midx++) { diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7879ed8ceb76..3bcd669a3152 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1664,7 +1664,7 @@ pnfs_update_layout(struct inode *ino, .offset = pos, .length = count, }; - unsigned pg_offset, seq; + unsigned pg_offset; struct nfs_server *server = NFS_SERVER(ino); struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo = NULL; @@ -1754,10 +1754,14 @@ lookup_again: } first = true; - do { - seq = read_seqbegin(&ctx->state->seqlock); - nfs4_stateid_copy(&stateid, &ctx->state->stateid); - } while (read_seqretry(&ctx->state->seqlock, seq)); + if (nfs4_select_rw_stateid(ctx->state, + iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ, + NULL, &stateid, NULL) != 0) { + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + goto out_unlock; + } } else { nfs4_stateid_copy(&stateid, &lo->plh_stateid); } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 0d42573d423d..48d7277c60a9 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -25,6 +25,7 @@ #include "iostat.h" #include "fscache.h" #include "pnfs.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -200,6 +201,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, task_setup_data->flags |= swap_flags; rpc_ops->read_setup(hdr, msg); + trace_nfs_initiate_read(inode, hdr->io_start, hdr->good_bytes); } static void @@ -232,6 +234,8 @@ static int nfs_readpage_done(struct rpc_task *task, return status; nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count); + trace_nfs_readpage_done(inode, task->tk_status, + hdr->args.offset, hdr->res.eof); if (task->tk_status == -ESTALE) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6b179af59b92..c9d24bae3025 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -812,7 +812,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) * Display all mount option settings */ seq_printf(m, "\n\topts:\t"); - seq_puts(m, root->d_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw"); seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f68083db63c8..babebbccae2a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -145,13 +145,6 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc) kref_put(&ioc->refcount, nfs_io_completion_release); } -static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) -{ - ctx->error = error; - smp_wmb(); - set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); -} - static struct nfs_page * nfs_page_private_request(struct page *page) { @@ -1383,6 +1376,8 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, task_setup_data->priority = priority; rpc_ops->write_setup(hdr, msg); + trace_nfs_initiate_write(hdr->inode, hdr->io_start, hdr->good_bytes, + hdr->args.stable); nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client, &task_setup_data->rpc_client, msg, hdr); @@ -1540,7 +1535,10 @@ static int nfs_writeback_done(struct rpc_task *task, status = NFS_PROTO(inode)->write_done(task, hdr); if (status != 0) return status; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); + trace_nfs_writeback_done(inode, task->tk_status, + hdr->args.offset, hdr->res.verf); if (hdr->res.verf->committed < hdr->args.stable && task->tk_status >= 0) { @@ -1669,6 +1667,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, }; /* Set up the initial task struct. */ nfs_ops->commit_setup(data, &msg); + trace_nfs_initiate_commit(data); dprintk("NFS: initiated commit call\n"); @@ -1793,6 +1792,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) /* Call the NFS version-specific code */ NFS_PROTO(data->inode)->commit_done(task, data); + trace_nfs_commit_done(data); } static void nfs_commit_release_pages(struct nfs_commit_data *data) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7ffe71a8dfb9..6a612d832e7d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -174,7 +174,7 @@ static int nilfs_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int err = 0; - if (inode->i_sb->s_flags & MS_RDONLY) { + if (sb_rdonly(inode->i_sb)) { nilfs_clear_dirty_pages(mapping, false); return -EROFS; } @@ -191,7 +191,7 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; int err; - if (inode->i_sb->s_flags & MS_RDONLY) { + if (sb_rdonly(inode->i_sb)) { /* * It means that filesystem was remounted in read-only * mode because of error or metadata corruption. But we diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 98835ed6bef4..c6bc1033e7d2 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -413,7 +413,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) struct super_block *sb; int err = 0; - if (inode && (inode->i_sb->s_flags & MS_RDONLY)) { + if (inode && sb_rdonly(inode->i_sb)) { /* * It means that filesystem was remounted in read-only * mode because of error or metadata corruption. But we diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 926682981d61..4fc018dfcfae 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -136,7 +136,7 @@ void __nilfs_error(struct super_block *sb, const char *function, va_end(args); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { nilfs_set_error(sb); if (nilfs_test_opt(nilfs, ERRORS_RO)) { @@ -478,7 +478,7 @@ static void nilfs_put_super(struct super_block *sb) nilfs_detach_log_writer(sb); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { down_write(&nilfs->ns_sem); nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); @@ -578,7 +578,7 @@ static int nilfs_freeze(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; int err; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 0; /* Mark super block clean */ @@ -592,7 +592,7 @@ static int nilfs_unfreeze(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 0; down_write(&nilfs->ns_sem); @@ -898,7 +898,7 @@ int nilfs_check_feature_compatibility(struct super_block *sb, } features = le64_to_cpu(sbp->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; - if (!(sb->s_flags & MS_RDONLY) && features) { + if (!sb_rdonly(sb) && features) { nilfs_msg(sb, KERN_ERR, "couldn't mount RDWR because of unsupported optional features (%llx)", (unsigned long long)features); @@ -1083,7 +1083,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) goto failed_unload; } - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { err = nilfs_attach_log_writer(sb, fsroot); if (err) goto failed_checkpoint; @@ -1095,7 +1095,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) nilfs_put_root(fsroot); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { down_write(&nilfs->ns_sem); nilfs_setup_super(sb, true); up_write(&nilfs->ns_sem); @@ -1144,7 +1144,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) goto out; if (*flags & MS_RDONLY) { /* Shutting down log writer */ @@ -1338,8 +1338,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, if ((flags ^ s->s_flags) & MS_RDONLY) { nilfs_msg(s, KERN_ERR, "the device already has a %s mount.", - (s->s_flags & MS_RDONLY) ? - "read-only" : "read/write"); + sb_rdonly(s) ? "read-only" : "read/write"); err = -EBUSY; goto failed_super; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index ecb49870a680..3f70f041dbe9 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -487,7 +487,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) * When remounting read-only, mark the volume clean if no volume errors * have occurred. */ - if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { static const char *es = ". Cannot remount read-write."; /* Remounting read-write. */ @@ -548,7 +548,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) NVolSetErrors(vol); return -EROFS; } - } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { + } else if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { /* Remounting read-only. */ if (!NVolErrors(vol)) { if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) @@ -732,7 +732,7 @@ hotfix_primary_boot_sector: * on a large sector device contains the whole boot loader or * just the first 512 bytes). */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { ntfs_warning(sb, "Hot-fix: Recovering invalid primary " "boot sector from backup copy."); memcpy(bh_primary->b_data, bh_backup->b_data, @@ -1789,7 +1789,7 @@ static bool load_system_files(ntfs_volume *vol) static const char *es3 = ". Run ntfsfix and/or chkdsk."; /* If a read-write mount, convert it to a read-only mount. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | ON_ERRORS_CONTINUE))) { ntfs_error(sb, "%s and neither on_errors=" @@ -1928,7 +1928,7 @@ get_ctx_vol_failed: (unsigned)le16_to_cpu(vol->vol_flags)); } /* If a read-write mount, convert it to a read-only mount. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | ON_ERRORS_CONTINUE))) { ntfs_error(sb, "%s and neither on_errors=" @@ -1961,7 +1961,7 @@ get_ctx_vol_failed: es1 = !vol->logfile_ino ? es1a : es1b; /* If a read-write mount, convert it to a read-only mount. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | ON_ERRORS_CONTINUE))) { ntfs_error(sb, "%s and neither on_errors=" @@ -2010,7 +2010,7 @@ get_ctx_vol_failed: es1 = err < 0 ? es1a : es1b; /* If a read-write mount, convert it to a read-only mount. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | ON_ERRORS_CONTINUE))) { ntfs_error(sb, "%s and neither on_errors=" @@ -2028,8 +2028,7 @@ get_ctx_vol_failed: NVolSetErrors(vol); } /* If (still) a read-write mount, mark the volume dirty. */ - if (!(sb->s_flags & MS_RDONLY) && - ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { static const char *es1 = "Failed to set dirty bit in volume " "information flags"; static const char *es2 = ". Run chkdsk."; @@ -2075,8 +2074,7 @@ get_ctx_vol_failed: } #endif /* If (still) a read-write mount, empty the logfile. */ - if (!(sb->s_flags & MS_RDONLY) && - !ntfs_empty_logfile(vol->logfile_ino)) { + if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) { static const char *es1 = "Failed to empty $LogFile"; static const char *es2 = ". Mount in Windows."; @@ -2121,7 +2119,7 @@ get_ctx_vol_failed: static const char *es2 = ". Run chkdsk."; /* If a read-write mount, convert it to a read-only mount. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | ON_ERRORS_CONTINUE))) { ntfs_error(sb, "%s and neither on_errors=" @@ -2139,8 +2137,7 @@ get_ctx_vol_failed: NVolSetErrors(vol); } /* If (still) a read-write mount, mark the quotas out of date. */ - if (!(sb->s_flags & MS_RDONLY) && - !ntfs_mark_quotas_out_of_date(vol)) { + if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) { static const char *es1 = "Failed to mark quotas out of date"; static const char *es2 = ". Run chkdsk."; @@ -2165,7 +2162,7 @@ get_ctx_vol_failed: static const char *es2 = ". Run chkdsk."; /* If a read-write mount, convert it to a read-only mount. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | ON_ERRORS_CONTINUE))) { ntfs_error(sb, "%s and neither on_errors=" @@ -2183,7 +2180,7 @@ get_ctx_vol_failed: NVolSetErrors(vol); } /* If (still) a read-write mount, stamp the transaction log. */ - if (!(sb->s_flags & MS_RDONLY) && !ntfs_stamp_usnjrnl(vol)) { + if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) { static const char *es1 = "Failed to stamp transaction log " "($UsnJrnl)"; static const char *es2 = ". Run chkdsk."; @@ -2314,7 +2311,7 @@ static void ntfs_put_super(struct super_block *sb) * If a read-write mount and no volume errors have occurred, mark the * volume clean. Also, re-commit all affected inodes. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!NVolErrors(vol)) { if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) ntfs_warning(sb, "Failed to clear dirty bit " diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3f936be379a9..80733496b22a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -675,7 +675,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) } /* We're going to/from readonly mode. */ - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ if (*flags & MS_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); @@ -1063,7 +1063,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, * heartbeat=none */ if (bdev_read_only(sb->s_bdev)) { - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { status = -EACCES; mlog(ML_ERROR, "Readonly device detected but readonly " "mount was not specified.\n"); @@ -1098,7 +1098,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } if (!ocfs2_is_hard_readonly(osb)) { - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) ocfs2_set_ro_flag(osb, 0); } @@ -1179,7 +1179,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Now we can initialize quotas because we can afford to wait * for cluster locks recovery now. That also means that truncation * log recovery can happen but that waits for proper quota setup */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { status = ocfs2_enable_quotas(osb); if (status < 0) { /* We have to err-out specially here because @@ -2180,8 +2180,7 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } - if (!(osb->sb->s_flags & MS_RDONLY) && - (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { + if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { mlog(ML_ERROR, "couldn't mount RDWR because of " "unsupported optional features (%x).\n", i); status = -EINVAL; @@ -2567,9 +2566,7 @@ static int ocfs2_handle_error(struct super_block *sb) rv = -EIO; } else { /* default option */ rv = -EROFS; - if (sb->s_flags & MS_RDONLY && - (ocfs2_is_soft_readonly(osb) || - ocfs2_is_hard_readonly(osb))) + if (sb_rdonly(sb) && (ocfs2_is_soft_readonly(osb) || ocfs2_is_hard_readonly(osb))) return rv; pr_crit("OCFS2: File system is now read-only.\n"); diff --git a/fs/open.c b/fs/open.c index 35bb784763a4..7ea118471dce 100644 --- a/fs/open.c +++ b/fs/open.c @@ -96,7 +96,7 @@ long vfs_truncate(const struct path *path, loff_t length) * write access on the upper inode, not on the overlay inode. For * non-overlay filesystems d_real() is an identity function. */ - upperdentry = d_real(path->dentry, NULL, O_WRONLY); + upperdentry = d_real(path->dentry, NULL, O_WRONLY, 0); error = PTR_ERR(upperdentry); if (IS_ERR(upperdentry)) goto mnt_drop_write_and_out; @@ -670,12 +670,12 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) if (!f.file) goto out; - error = mnt_want_write_file(f.file); + error = mnt_want_write_file_path(f.file); if (error) goto out_fput; audit_file(f.file); error = chown_common(&f.file->f_path, user, group); - mnt_drop_write_file(f.file); + mnt_drop_write_file_path(f.file); out_fput: fdput(f); out: @@ -857,7 +857,7 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags); + struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 7a3754488312..9108ef433e6d 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -35,7 +35,7 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type) * I don't do that for now. */ value = kmalloc(ORANGEFS_MAX_XATTR_VALUELEN, GFP_KERNEL); - if (value == NULL) + if (!value) return ERR_PTR(-ENOMEM); gossip_debug(GOSSIP_ACL_DEBUG, @@ -61,9 +61,9 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type) return acl; } -int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, + int type) { - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); int error = 0; void *value = NULL; size_t size = 0; @@ -72,22 +72,6 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - umode_t mode; - - error = posix_acl_update_mode(inode, &mode, &acl); - if (error) { - gossip_err("%s: posix_acl_update_mode err: %d\n", - __func__, - error); - return error; - } - - if (inode->i_mode != mode) - SetModeFlag(orangefs_inode); - inode->i_mode = mode; - mark_inode_dirty_sync(inode); - } break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; @@ -132,6 +116,42 @@ out: return error; } +int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + struct iattr iattr; + int rc; + + if (type == ACL_TYPE_ACCESS && acl) { + /* + * posix_acl_update_mode checks to see if the permissions + * described by the ACL can be encoded into the + * object's mode. If so, it sets "acl" to NULL + * and "mode" to the new desired value. It is up to + * us to propagate the new mode back to the server... + */ + error = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + if (error) { + gossip_err("%s: posix_acl_update_mode err: %d\n", + __func__, + error); + return error; + } + + if (acl) { + rc = __orangefs_set_acl(inode, acl, type); + } else { + iattr.ia_valid = ATTR_MODE; + rc = orangefs_inode_setattr(inode, &iattr); + } + + return rc; + + } else { + return -EINVAL; + } +} + int orangefs_init_acl(struct inode *inode, struct inode *dir) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); @@ -146,13 +166,14 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir) return error; if (default_acl) { - error = orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __orangefs_set_acl(inode, default_acl, + ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!error) - error = orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index c19f0787c9c6..2826859bdc2c 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -461,13 +461,10 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, if (op->downcall.type != ORANGEFS_VFS_OP_READDIR) goto wakeup; - op->downcall.trailer_buf = - vmalloc(op->downcall.trailer_size); - if (op->downcall.trailer_buf == NULL) { - gossip_err("%s: failed trailer vmalloc.\n", - __func__); + op->downcall.trailer_buf = vmalloc(op->downcall.trailer_size); + if (!op->downcall.trailer_buf) goto Enomem; - } + memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size); if (!copy_from_iter_full(op->downcall.trailer_buf, op->downcall.trailer_size, iter)) { diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 28f38d813ad2..336ecbf8c268 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -646,14 +646,11 @@ static int orangefs_fsync(struct file *file, loff_t end, int datasync) { - int ret = -EINVAL; + int ret; struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(file_inode(file)); struct orangefs_kernel_op_s *new_op = NULL; - /* required call */ - filemap_write_and_wait_range(file->f_mapping, start, end); - new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); if (!new_op) return -ENOMEM; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 038d67545d9f..7ef473f3d642 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -244,20 +244,14 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) bufmap->buffer_index_array = kzalloc(DIV_ROUND_UP(bufmap->desc_count, BITS_PER_LONG), GFP_KERNEL); - if (!bufmap->buffer_index_array) { - gossip_err("orangefs: could not allocate %d buffer indices\n", - bufmap->desc_count); + if (!bufmap->buffer_index_array) goto out_free_bufmap; - } bufmap->desc_array = kcalloc(bufmap->desc_count, sizeof(struct orangefs_bufmap_desc), GFP_KERNEL); - if (!bufmap->desc_array) { - gossip_err("orangefs: could not allocate %d descriptors\n", - bufmap->desc_count); + if (!bufmap->desc_array) goto out_free_index_array; - } bufmap->page_count = bufmap->total_size / PAGE_SIZE; diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 716ed337f166..5f59917fd631 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -571,11 +571,8 @@ static int orangefs_prepare_cdm_array(char *debug_array_string) goto out; } - cdm_array = - kzalloc(cdm_element_count * sizeof(struct client_debug_mask), - GFP_KERNEL); + cdm_array = kcalloc(cdm_element_count, sizeof(*cdm_array), GFP_KERNEL); if (!cdm_array) { - pr_info("malloc failed for cdm_array!\n"); rc = -ENOMEM; goto out; } diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index c1b5174cb5a9..85ef87245a87 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -98,7 +98,6 @@ static int __init orangefs_init(void) orangefs_htable_ops_in_progress = kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL); if (!orangefs_htable_ops_in_progress) { - gossip_err("Failed to initialize op hashtable"); ret = -ENOMEM; goto cleanup_inode; } diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 5a1bed6c8c6a..47f3fb9cbec4 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -107,10 +107,8 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) struct orangefs_inode_s *orangefs_inode; orangefs_inode = kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL); - if (orangefs_inode == NULL) { - gossip_err("Failed to allocate orangefs_inode\n"); + if (!orangefs_inode) return NULL; - } /* * We want to clear everything except for rw_semaphore and the diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 237c9c04dc3b..81ac88bb91ff 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -76,7 +76,7 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; fsuid = from_kuid(&init_user_ns, current_fsuid()); @@ -169,7 +169,7 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name, struct orangefs_kernel_op_s *new_op = NULL; int ret = -ENOMEM; - if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; down_write(&orangefs_inode->xattr_sem); @@ -233,13 +233,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name, if (size > ORANGEFS_MAX_XATTR_VALUELEN) return -EINVAL; - if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; internal_flag = convert_to_internal_xattr_flags(flags); /* This is equivalent to a removexattr */ - if (size == 0 && value == NULL) { + if (size == 0 && !value) { gossip_debug(GOSSIP_XATTR_DEBUG, "removing xattr (%s)\n", name); @@ -311,7 +311,7 @@ ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size) int i = 0; int returned_count = 0; - if (size > 0 && buffer == NULL) { + if (size > 0 && !buffer) { gossip_err("%s: bogus NULL pointers\n", __func__); return -EINVAL; } @@ -442,7 +442,7 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler, } -static struct xattr_handler orangefs_xattr_default_handler = { +static const struct xattr_handler orangefs_xattr_default_handler = { .prefix = "", /* match any name => handlers called with full name */ .get = orangefs_xattr_get_default, .set = orangefs_xattr_set_default, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index acb6f97deb97..aad97b30d5e6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -241,7 +241,7 @@ struct ovl_fh *ovl_encode_fh(struct dentry *lower, bool is_upper) int buflen = MAX_HANDLE_SZ; uuid_t *uuid = &lower->d_sb->s_uuid; - buf = kmalloc(buflen, GFP_TEMPORARY); + buf = kmalloc(buflen, GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 48b70e6490f3..3309b1912241 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -155,7 +155,7 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) static void ovl_instantiate(struct dentry *dentry, struct inode *inode, struct dentry *newdentry, bool hardlink) { - ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_version_inc(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); if (!hardlink) { ovl_inode_update(inode, newdentry); @@ -692,7 +692,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) if (flags) ovl_cleanup(wdir, upper); - ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_version_inc(dentry->d_parent, true); out_d_drop: d_drop(dentry); dput(whiteout); @@ -742,7 +742,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) err = vfs_rmdir(dir, upper); else err = vfs_unlink(dir, upper, NULL); - ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_version_inc(dentry->d_parent, ovl_type_origin(dentry)); /* * Keeping this dentry hashed would mean having to release @@ -833,7 +833,7 @@ static char *ovl_get_redirect(struct dentry *dentry, bool samedir) goto out; } - buf = ret = kmalloc(buflen, GFP_TEMPORARY); + buf = ret = kmalloc(buflen, GFP_KERNEL); if (!buf) goto out; @@ -1089,8 +1089,9 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, drop_nlink(d_inode(new)); } - ovl_dentry_version_inc(old->d_parent); - ovl_dentry_version_inc(new->d_parent); + ovl_dentry_version_inc(old->d_parent, + !overwrite && ovl_type_origin(new)); + ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old)); out_dput: dput(newdentry); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 5bc71642b226..a619addecafc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -498,6 +498,9 @@ static int ovl_set_nlink_common(struct dentry *dentry, len = snprintf(buf, sizeof(buf), format, (int) (inode->i_nlink - realinode->i_nlink)); + if (WARN_ON(len >= sizeof(buf))) + return -EIO; + return ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_NLINK, buf, len, 0); } @@ -576,10 +579,13 @@ static int ovl_inode_set(struct inode *inode, void *data) static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, struct dentry *upperdentry) { - struct inode *lowerinode = lowerdentry ? d_inode(lowerdentry) : NULL; - - /* Lower (origin) inode must match, even if NULL */ - if (ovl_inode_lower(inode) != lowerinode) + /* + * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. + * This happens when finding a copied up overlay inode for a renamed + * or hardlinked overlay dentry and lower dentry cannot be followed + * by origin because lower fs does not support file handles. + */ + if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry)) return false; /* diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 8aef2b304b2d..c3addd1114f1 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -38,7 +38,7 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, return 0; goto fail; } - buf = kzalloc(prelen + res + strlen(post) + 1, GFP_TEMPORARY); + buf = kzalloc(prelen + res + strlen(post) + 1, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -103,7 +103,7 @@ static struct ovl_fh *ovl_get_origin_fh(struct dentry *dentry) if (res == 0) return NULL; - fh = kzalloc(res, GFP_TEMPORARY); + fh = kzalloc(res, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); @@ -309,7 +309,7 @@ static int ovl_check_origin(struct dentry *upperdentry, BUG_ON(*ctrp); if (!*stackp) - *stackp = kmalloc(sizeof(struct path), GFP_TEMPORARY); + *stackp = kmalloc(sizeof(struct path), GFP_KERNEL); if (!*stackp) { dput(origin); return -ENOMEM; @@ -418,7 +418,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, err = -ENOMEM; len = index->d_name.len / 2; - fh = kzalloc(len, GFP_TEMPORARY); + fh = kzalloc(len, GFP_KERNEL); if (!fh) goto fail; @@ -478,7 +478,7 @@ int ovl_get_index_name(struct dentry *origin, struct qstr *name) return PTR_ERR(fh); err = -ENOMEM; - n = kzalloc(fh->len * 2, GFP_TEMPORARY); + n = kzalloc(fh->len * 2, GFP_KERNEL); if (n) { s = bin2hex(n, fh, fh->len); *name = (struct qstr) QSTR_INIT(n, s - n); @@ -646,7 +646,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!d.stop && poe->numlower) { err = -ENOMEM; stack = kcalloc(ofs->numlower, sizeof(struct path), - GFP_TEMPORARY); + GFP_KERNEL); if (!stack) goto out_put_upper; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e927a62c97ae..d4e8c1a08fb0 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -204,8 +204,8 @@ struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); struct inode *ovl_inode_lower(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); -struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); -void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); @@ -217,7 +217,7 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, struct dentry *lowerdentry); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); -void ovl_dentry_version_inc(struct dentry *dentry); +void ovl_dentry_version_inc(struct dentry *dentry, bool impurity); u64 ovl_dentry_version_get(struct dentry *dentry); bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); @@ -229,6 +229,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, int xerr); int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); void ovl_set_flag(unsigned long flag, struct inode *inode); +void ovl_clear_flag(unsigned long flag, struct inode *inode); bool ovl_test_flag(unsigned long flag, struct inode *inode); bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); @@ -256,6 +257,7 @@ extern const struct file_operations ovl_dir_operations; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); +void ovl_dir_cache_free(struct inode *inode); int ovl_check_d_type_supported(struct path *realpath); void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index f0fd3adb1693..62e9b22a2077 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -15,11 +15,13 @@ #include <linux/rbtree.h> #include <linux/security.h> #include <linux/cred.h> +#include <linux/ratelimit.h> #include "overlayfs.h" struct ovl_cache_entry { unsigned int len; unsigned int type; + u64 real_ino; u64 ino; struct list_head l_node; struct rb_node node; @@ -32,18 +34,20 @@ struct ovl_dir_cache { long refcount; u64 version; struct list_head entries; + struct rb_root root; }; struct ovl_readdir_data { struct dir_context ctx; struct dentry *dentry; bool is_lowest; - struct rb_root root; + struct rb_root *root; struct list_head *list; struct list_head middle; struct ovl_cache_entry *first_maybe_whiteout; int count; int err; + bool is_upper; bool d_type_supported; }; @@ -58,7 +62,33 @@ struct ovl_dir_file { static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) { - return container_of(n, struct ovl_cache_entry, node); + return rb_entry(n, struct ovl_cache_entry, node); +} + +static bool ovl_cache_entry_find_link(const char *name, int len, + struct rb_node ***link, + struct rb_node **parent) +{ + bool found = false; + struct rb_node **newp = *link; + + while (!found && *newp) { + int cmp; + struct ovl_cache_entry *tmp; + + *parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + found = true; + } + *link = newp; + + return found; } static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, @@ -82,6 +112,32 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, return NULL; } +static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, + struct ovl_cache_entry *p) +{ + /* Don't care if not doing ovl_iter() */ + if (!rdd->dentry) + return false; + + /* Always recalc d_ino for parent */ + if (strcmp(p->name, "..") == 0) + return true; + + /* If this is lower, then native d_ino will do */ + if (!rdd->is_upper) + return false; + + /* + * Recalc d_ino for '.' and for all entries if dir is impure (contains + * copied up entries) + */ + if ((p->name[0] == '.' && p->len == 1) || + ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry))) + return true; + + return false; +} + static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) @@ -97,7 +153,11 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, p->name[len] = '\0'; p->len = len; p->type = d_type; + p->real_ino = ino; p->ino = ino; + /* Defer setting d_ino for upper entry to ovl_iterate() */ + if (ovl_calc_d_ino(rdd, p)) + p->ino = 0; p->is_whiteout = false; if (d_type == DT_CHR) { @@ -111,32 +171,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { - struct rb_node **newp = &rdd->root.rb_node; + struct rb_node **newp = &rdd->root->rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; - while (*newp) { - int cmp; - struct ovl_cache_entry *tmp; - - parent = *newp; - tmp = ovl_cache_entry_from_node(*newp); - cmp = strncmp(name, tmp->name, len); - if (cmp > 0) - newp = &tmp->node.rb_right; - else if (cmp < 0 || len < tmp->len) - newp = &tmp->node.rb_left; - else - return 0; - } + if (ovl_cache_entry_find_link(name, len, &newp, &parent)) + return 0; p = ovl_cache_entry_new(rdd, name, len, ino, d_type); - if (p == NULL) + if (p == NULL) { + rdd->err = -ENOMEM; return -ENOMEM; + } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); - rb_insert_color(&p->node, &rdd->root); + rb_insert_color(&p->node, rdd->root); return 0; } @@ -147,7 +197,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd, { struct ovl_cache_entry *p; - p = ovl_cache_entry_find(&rdd->root, name, namelen); + p = ovl_cache_entry_find(rdd->root, name, namelen); if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { @@ -172,6 +222,16 @@ void ovl_cache_free(struct list_head *list) INIT_LIST_HEAD(list); } +void ovl_dir_cache_free(struct inode *inode) +{ + struct ovl_dir_cache *cache = ovl_dir_cache(inode); + + if (cache) { + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) { struct ovl_dir_cache *cache = od->cache; @@ -179,8 +239,8 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { - if (ovl_dir_cache(dentry) == cache) - ovl_set_dir_cache(dentry, NULL); + if (ovl_dir_cache(d_inode(dentry)) == cache) + ovl_set_dir_cache(d_inode(dentry), NULL); ovl_cache_free(&cache->entries); kfree(cache); @@ -273,7 +333,8 @@ static void ovl_dir_reset(struct file *file) od->is_real = false; } -static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) +static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, + struct rb_root *root) { int err; struct path realpath; @@ -281,13 +342,14 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) .ctx.actor = ovl_fill_merge, .dentry = dentry, .list = list, - .root = RB_ROOT, + .root = root, .is_lowest = false, }; int idx, next; for (idx = 0; idx != -1; idx = next) { next = ovl_path_next(idx, dentry, &realpath); + rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; if (next != -1) { err = ovl_dir_read(&realpath, &rdd); @@ -326,12 +388,13 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) int res; struct ovl_dir_cache *cache; - cache = ovl_dir_cache(dentry); + cache = ovl_dir_cache(d_inode(dentry)); if (cache && ovl_dentry_version_get(dentry) == cache->version) { + WARN_ON(!cache->refcount); cache->refcount++; return cache; } - ovl_set_dir_cache(dentry, NULL); + ovl_set_dir_cache(d_inode(dentry), NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) @@ -339,8 +402,9 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); + cache->root = RB_ROOT; - res = ovl_dir_read_merged(dentry, &cache->entries); + res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); @@ -348,22 +412,266 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) } cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(dentry, cache); + ovl_set_dir_cache(d_inode(dentry), cache); return cache; } +/* + * Set d_ino for upper entries. Non-upper entries should always report + * the uppermost real inode ino and should not call this function. + * + * When not all layer are on same fs, report real ino also for upper. + * + * When all layers are on the same fs, and upper has a reference to + * copy up origin, call vfs_getattr() on the overlay entry to make + * sure that d_ino will be consistent with st_ino from stat(2). + */ +static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) + +{ + struct dentry *dir = path->dentry; + struct dentry *this = NULL; + enum ovl_path_type type; + u64 ino = p->real_ino; + int err = 0; + + if (!ovl_same_sb(dir->d_sb)) + goto out; + + if (p->name[0] == '.') { + if (p->len == 1) { + this = dget(dir); + goto get; + } + if (p->len == 2 && p->name[1] == '.') { + /* we shall not be moved */ + this = dget(dir->d_parent); + goto get; + } + } + this = lookup_one_len(p->name, dir, p->len); + if (IS_ERR_OR_NULL(this) || !this->d_inode) { + if (IS_ERR(this)) { + err = PTR_ERR(this); + this = NULL; + goto fail; + } + goto out; + } + +get: + type = ovl_path_type(this); + if (OVL_TYPE_ORIGIN(type)) { + struct kstat stat; + struct path statpath = *path; + + statpath.dentry = this; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + goto fail; + + WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + ino = stat.ino; + } + +out: + p->ino = ino; + dput(this); + return err; + +fail: + pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n", + p->name, err); + goto out; +} + +static int ovl_fill_plain(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_cache_entry *p; + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + rdd->count++; + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + if (p == NULL) { + rdd->err = -ENOMEM; + return -ENOMEM; + } + list_add_tail(&p->l_node, rdd->list); + + return 0; +} + +static int ovl_dir_read_impure(struct path *path, struct list_head *list, + struct rb_root *root) +{ + int err; + struct path realpath; + struct ovl_cache_entry *p, *n; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_plain, + .list = list, + .root = root, + }; + + INIT_LIST_HEAD(list); + *root = RB_ROOT; + ovl_path_upper(path->dentry, &realpath); + + err = ovl_dir_read(&realpath, &rdd); + if (err) + return err; + + list_for_each_entry_safe(p, n, list, l_node) { + if (strcmp(p->name, ".") != 0 && + strcmp(p->name, "..") != 0) { + err = ovl_cache_update_ino(path, p); + if (err) + return err; + } + if (p->ino == p->real_ino) { + list_del(&p->l_node); + kfree(p); + } else { + struct rb_node **newp = &root->rb_node; + struct rb_node *parent = NULL; + + if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len, + &newp, &parent))) + return -EIO; + + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, root); + } + } + return 0; +} + +static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) +{ + int res; + struct dentry *dentry = path->dentry; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(d_inode(dentry)); + if (cache && ovl_dentry_version_get(dentry) == cache->version) + return cache; + + /* Impure cache is not refcounted, free it here */ + ovl_dir_cache_free(d_inode(dentry)); + ovl_set_dir_cache(d_inode(dentry), NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + res = ovl_dir_read_impure(path, &cache->entries, &cache->root); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + if (list_empty(&cache->entries)) { + /* Good oportunity to get rid of an unnecessary "impure" flag */ + ovl_do_removexattr(ovl_dentry_upper(dentry), OVL_XATTR_IMPURE); + ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); + kfree(cache); + return NULL; + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(d_inode(dentry), cache); + + return cache; +} + +struct ovl_readdir_translate { + struct dir_context *orig_ctx; + struct ovl_dir_cache *cache; + struct dir_context ctx; + u64 parent_ino; +}; + +static int ovl_fill_real(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_translate *rdt = + container_of(ctx, struct ovl_readdir_translate, ctx); + struct dir_context *orig_ctx = rdt->orig_ctx; + + if (rdt->parent_ino && strcmp(name, "..") == 0) + ino = rdt->parent_ino; + else if (rdt->cache) { + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); + if (p) + ino = p->ino; + } + + return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); +} + +static int ovl_iterate_real(struct file *file, struct dir_context *ctx) +{ + int err; + struct ovl_dir_file *od = file->private_data; + struct dentry *dir = file->f_path.dentry; + struct ovl_readdir_translate rdt = { + .ctx.actor = ovl_fill_real, + .orig_ctx = ctx, + }; + + if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { + struct kstat stat; + struct path statpath = file->f_path; + + statpath.dentry = dir->d_parent; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + return err; + + WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + rdt.parent_ino = stat.ino; + } + + if (ovl_test_flag(OVL_IMPURE, d_inode(dir))) { + rdt.cache = ovl_cache_get_impure(&file->f_path); + if (IS_ERR(rdt.cache)) + return PTR_ERR(rdt.cache); + } + + return iterate_dir(od->realfile, &rdt.ctx); +} + + static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct ovl_cache_entry *p; + int err; if (!ctx->pos) ovl_dir_reset(file); - if (od->is_real) + if (od->is_real) { + /* + * If parent is merge, then need to adjust d_ino for '..', if + * dir is impure then need to adjust d_ino for copied up + * entries. + */ + if (ovl_same_sb(dentry->d_sb) && + (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))) { + return ovl_iterate_real(file, ctx); + } return iterate_dir(od->realfile, ctx); + } if (!od->cache) { struct ovl_dir_cache *cache; @@ -378,9 +686,15 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) while (od->cursor != &od->cache->entries) { p = list_entry(od->cursor, struct ovl_cache_entry, l_node); - if (!p->is_whiteout) + if (!p->is_whiteout) { + if (!p->ino) { + err = ovl_cache_update_ino(&file->f_path, p); + if (err) + return err; + } if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; + } od->cursor = p->l_node.next; ctx->pos++; } @@ -522,8 +836,9 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; struct ovl_cache_entry *p; + struct rb_root root = RB_ROOT; - err = ovl_dir_read_merged(dentry, list); + err = ovl_dir_read_merged(dentry, list, &root); if (err) return err; @@ -612,12 +927,13 @@ static void ovl_workdir_cleanup_recurse(struct path *path, int level) int err; struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); + struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .dentry = NULL, .list = &list, - .root = RB_ROOT, + .root = &root, .is_lowest = false, }; @@ -675,12 +991,13 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, struct inode *dir = dentry->d_inode; struct path path = { .mnt = mnt, .dentry = dentry }; LIST_HEAD(list); + struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .dentry = NULL, .list = &list, - .root = RB_ROOT, + .root = &root, .is_lowest = false, }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d86e89f97201..fd5ea4facc62 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -70,20 +70,20 @@ static int ovl_check_append_only(struct inode *inode, int flag) static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode, - unsigned int open_flags) + unsigned int open_flags, unsigned int flags) { struct dentry *real; int err; + if (flags & D_REAL_UPPER) + return ovl_dentry_upper(dentry); + if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) return dentry; goto bug; } - if (d_is_negative(dentry)) - return dentry; - if (open_flags) { err = ovl_open_maybe_copy_up(dentry, open_flags); if (err) @@ -105,7 +105,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry, goto bug; /* Handle recursion */ - real = d_real(real, inode, open_flags); + real = d_real(real, inode, open_flags, 0); if (!inode || inode == d_inode(real)) return real; @@ -198,6 +198,7 @@ static void ovl_destroy_inode(struct inode *inode) dput(oi->__upperdentry); kfree(oi->redirect); + ovl_dir_cache_free(inode); mutex_destroy(&oi->lock); call_rcu(&inode->i_rcu, ovl_i_callback); @@ -869,7 +870,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_free_config; /* Upper fs should not be r/o */ - if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) { + if (sb_rdonly(upperpath.mnt->mnt_sb)) { pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); err = -EINVAL; goto out_put_upperpath; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f46ad75dc96a..117794582f9f 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -180,14 +180,14 @@ struct inode *ovl_inode_real(struct inode *inode) } -struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode) { - return OVL_I(d_inode(dentry))->cache; + return OVL_I(inode)->cache; } -void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache) { - OVL_I(d_inode(dentry))->cache = cache; + OVL_I(inode)->cache = cache; } bool ovl_dentry_is_opaque(struct dentry *dentry) @@ -275,12 +275,19 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) } } -void ovl_dentry_version_inc(struct dentry *dentry) +void ovl_dentry_version_inc(struct dentry *dentry, bool impurity) { struct inode *inode = d_inode(dentry); WARN_ON(!inode_is_locked(inode)); - OVL_I(inode)->version++; + /* + * Version is used by readdir code to keep cache consistent. For merge + * dirs all changes need to be noted. For non-merge dirs, cache only + * contains impure (ones which have been copied up and have origins) + * entries, so only need to note changes to impure entries. + */ + if (OVL_TYPE_MERGE(ovl_path_type(dentry)) || impurity) + OVL_I(inode)->version++; } u64 ovl_dentry_version_get(struct dentry *dentry) @@ -382,6 +389,11 @@ void ovl_set_flag(unsigned long flag, struct inode *inode) set_bit(flag, &OVL_I(inode)->flags); } +void ovl_clear_flag(unsigned long flag, struct inode *inode) +{ + clear_bit(flag, &OVL_I(inode)->flags); +} + bool ovl_test_flag(unsigned long flag, struct inode *inode) { return test_bit(flag, &OVL_I(inode)->flags); diff --git a/fs/proc/array.c b/fs/proc/array.c index 88c355574aa0..525157ca25cb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -62,6 +62,7 @@ #include <linux/mman.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> +#include <linux/sched/task_stack.h> #include <linux/sched/task.h> #include <linux/sched/cputime.h> #include <linux/proc_fs.h> @@ -421,7 +422,15 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * esp and eip are intentionally zeroed out. There is no * non-racy way to read them without freezing the task. * Programs that need reliable values can use ptrace(2). + * + * The only exception is if the task is core dumping because + * a program is not able to use ptrace(2) in that case. It is + * safe because the task has stopped executing permanently. */ + if (permitted && (task->flags & PF_DUMPCORE)) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } } get_task_comm(tcomm, task); diff --git a/fs/proc/base.c b/fs/proc/base.c index e5d89a0d0b8a..ad3b0762cc3e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -232,7 +232,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, goto out_mmput; } - page = (char *)__get_free_page(GFP_TEMPORARY); + page = (char *)__get_free_page(GFP_KERNEL); if (!page) { rv = -ENOMEM; goto out_mmput; @@ -813,7 +813,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mm) return 0; - page = (char *)__get_free_page(GFP_TEMPORARY); + page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; @@ -918,7 +918,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!mm || !mm->env_end) return 0; - page = (char *)__get_free_page(GFP_TEMPORARY); + page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; @@ -1630,7 +1630,7 @@ out: static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { - char *tmp = (char*)__get_free_page(GFP_TEMPORARY); + char *tmp = (char *)__get_free_page(GFP_KERNEL); char *pathname; int len; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7b40e11ede9b..5589b4bd4b85 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1474,7 +1474,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); + pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index dea90b566a6e..b00b766098fa 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -145,7 +145,6 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, int is_pid) { struct mm_struct *mm = vma->vm_mm; - struct proc_maps_private *priv = m->private; unsigned long ino = 0; struct file *file; dev_t dev = 0; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index b5713fefb4c1..99dff222fe67 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -178,7 +178,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) } else { mangle(m, r->mnt_devname ? r->mnt_devname : "none"); } - seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); + seq_puts(m, sb_rdonly(sb) ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) goto out; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 07e08c7d05ca..a9c5dfe6b83e 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -753,7 +753,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_XGETNEXTQUOTA: return quota_getnextxquota(sb, type, id, addr); case Q_XQUOTASYNC: - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return -EROFS; /* XFS quotas are fully coherent now, making this call a noop */ return 0; diff --git a/fs/read_write.c b/fs/read_write.c index 61b58c7b6531..a2b9a47235c5 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -413,7 +413,20 @@ ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, else return -EINVAL; } -EXPORT_SYMBOL(__vfs_read); + +ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + ssize_t result; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + result = vfs_read(file, (void __user *)buf, count, pos); + set_fs(old_fs); + return result; +} +EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { @@ -441,8 +454,6 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) return ret; } -EXPORT_SYMBOL(vfs_read); - static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; @@ -471,9 +482,8 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, else return -EINVAL; } -EXPORT_SYMBOL(__vfs_write); -ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; const char __user *p; @@ -496,9 +506,24 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t inc_syscw(current); return ret; } - EXPORT_SYMBOL(__kernel_write); +ssize_t kernel_write(struct file *file, const void *buf, size_t count, + loff_t *pos) +{ + mm_segment_t old_fs; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_write(file, (__force const char __user *)buf, count, pos); + set_fs(old_fs); + + return res; +} +EXPORT_SYMBOL(kernel_write); + ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; @@ -527,8 +552,6 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ return ret; } -EXPORT_SYMBOL(vfs_write); - static inline loff_t file_pos_read(struct file *file) { return file->f_pos; @@ -959,9 +982,8 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, return ret; } -EXPORT_SYMBOL(vfs_readv); -ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, +static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; @@ -978,7 +1000,6 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, } return ret; } -EXPORT_SYMBOL(vfs_writev); static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 873fc04e9403..11a48affa882 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1776,7 +1776,7 @@ int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct reiserfs_transaction_handle th; int jbegin_count = 1; - if (inode->i_sb->s_flags & MS_RDONLY) + if (sb_rdonly(inode->i_sb)) return -EROFS; /* * memory pressure can sometimes initiate write_inode calls with diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 0c882a0e2a6e..f59c667df15b 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1918,7 +1918,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, * we only want to flush out transactions if we were * called with error == 0 */ - if (!error && !(sb->s_flags & MS_RDONLY)) { + if (!error && !sb_rdonly(sb)) { /* end the current trans */ BUG_ON(!th->t_trans_id); do_journal_end(th, FLUSH_ALL); diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 4f3f928076f3..64f49cafbc5b 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -386,7 +386,7 @@ void __reiserfs_error(struct super_block *sb, const char *id, printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n", sb->s_id, function, error_buf); - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; reiserfs_info(sb, "Remounting filesystem read-only\n"); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 306e4e9d172d..5464ec517702 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -121,7 +121,7 @@ void reiserfs_schedule_old_flush(struct super_block *s) * Avoid scheduling flush when sb is being shut down. It can race * with journal shutdown and free still queued delayed work. */ - if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE)) + if (sb_rdonly(s) || !(s->s_flags & MS_ACTIVE)) return; spin_lock(&sbi->old_work_lock); @@ -151,7 +151,7 @@ static int reiserfs_freeze(struct super_block *s) reiserfs_cancel_old_flush(s); reiserfs_write_lock(s); - if (!(s->s_flags & MS_RDONLY)) { + if (!sb_rdonly(s)) { int err = journal_begin(&th, s, 1); if (err) { reiserfs_block_writes(&th); @@ -599,7 +599,7 @@ static void reiserfs_put_super(struct super_block *s) * change file system state to current state if it was mounted * with read-write permissions */ - if (!(s->s_flags & MS_RDONLY)) { + if (!sb_rdonly(s)) { if (!journal_begin(&th, s, 10)) { reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -700,7 +700,7 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags) int err = 0; - if (inode->i_sb->s_flags & MS_RDONLY) { + if (sb_rdonly(inode->i_sb)) { reiserfs_warning(inode->i_sb, "clm-6006", "writing inode %lu on readonly FS", inode->i_ino); @@ -1525,7 +1525,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); /* remount read-only */ - if (s->s_flags & MS_RDONLY) + if (sb_rdonly(s)) /* it is read-only already */ goto out_ok_unlocked; @@ -1551,7 +1551,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); } else { /* remount read-write */ - if (!(s->s_flags & MS_RDONLY)) { + if (!sb_rdonly(s)) { reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); goto out_ok_unlocked; /* We are read-write already */ @@ -1855,7 +1855,7 @@ static int what_hash(struct super_block *s) * the super */ if (code != UNSET_HASH && - !(s->s_flags & MS_RDONLY) && + !sb_rdonly(s) && code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); } @@ -2052,7 +2052,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (replay_only(s)) goto error_unlocked; - if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { + if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) { SWARN(silent, s, "clm-7000", "Detected readonly device, marking FS readonly"); s->s_flags |= MS_RDONLY; @@ -2101,7 +2101,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) else set_bit(REISERFS_3_6, &sbi->s_properties); - if (!(s->s_flags & MS_RDONLY)) { + if (!sb_rdonly(s)) { errval = journal_begin(&th, s, 1); if (errval) { diff --git a/fs/splice.c b/fs/splice.c index ae41201d0325..f3084cce0ea6 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -364,22 +364,6 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec, return res; } -ssize_t kernel_write(struct file *file, const char *buf, size_t count, - loff_t pos) -{ - mm_segment_t old_fs; - ssize_t res; - - old_fs = get_fs(); - set_fs(get_ds()); - /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_write(file, (__force const char __user *)buf, count, &pos); - set_fs(old_fs); - - return res; -} -EXPORT_SYMBOL(kernel_write); - static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index ffb093e72b6c..1adb3346b9d6 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -165,6 +165,20 @@ config SQUASHFS_XZ If unsure, say N. +config SQUASHFS_ZSTD + bool "Include support for ZSTD compressed file systems" + depends on SQUASHFS + select ZSTD_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with ZSTD compression. ZSTD gives better compression than + the default ZLIB compression, while using less CPU. + + ZSTD is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config SQUASHFS_4K_DEVBLK_SIZE bool "Use 4K device block size?" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 246a6f329d89..6655631c53ae 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -15,3 +15,4 @@ squashfs-$(CONFIG_SQUASHFS_LZ4) += lz4_wrapper.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o +squashfs-$(CONFIG_SQUASHFS_ZSTD) += zstd_wrapper.o diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index d2bc13636f79..836639810ea0 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -65,6 +65,12 @@ static const struct squashfs_decompressor squashfs_zlib_comp_ops = { }; #endif +#ifndef CONFIG_SQUASHFS_ZSTD +static const struct squashfs_decompressor squashfs_zstd_comp_ops = { + NULL, NULL, NULL, NULL, ZSTD_COMPRESSION, "zstd", 0 +}; +#endif + static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, NULL, 0, "unknown", 0 }; @@ -75,6 +81,7 @@ static const struct squashfs_decompressor *decompressor[] = { &squashfs_lzo_comp_ops, &squashfs_xz_comp_ops, &squashfs_lzma_unsupported_comp_ops, + &squashfs_zstd_comp_ops, &squashfs_unknown_comp_ops }; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index a25713c031a5..0f5a8e4e58da 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -58,4 +58,8 @@ extern const struct squashfs_decompressor squashfs_lzo_comp_ops; extern const struct squashfs_decompressor squashfs_zlib_comp_ops; #endif +#ifdef CONFIG_SQUASHFS_ZSTD +extern const struct squashfs_decompressor squashfs_zstd_comp_ops; +#endif + #endif diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 506f4ba5b983..24d12fd14177 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -241,6 +241,7 @@ struct meta_index { #define LZO_COMPRESSION 3 #define XZ_COMPRESSION 4 #define LZ4_COMPRESSION 5 +#define ZSTD_COMPRESSION 6 struct squashfs_super_block { __le32 s_magic; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c new file mode 100644 index 000000000000..eeaabf881159 --- /dev/null +++ b/fs/squashfs/zstd_wrapper.c @@ -0,0 +1,151 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * zstd_wrapper.c + */ + +#include <linux/mutex.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/zstd.h> +#include <linux/vmalloc.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" +#include "page_actor.h" + +struct workspace { + void *mem; + size_t mem_size; + size_t window_size; +}; + +static void *zstd_init(struct squashfs_sb_info *msblk, void *buff) +{ + struct workspace *wksp = kmalloc(sizeof(*wksp), GFP_KERNEL); + + if (wksp == NULL) + goto failed; + wksp->window_size = max_t(size_t, + msblk->block_size, SQUASHFS_METADATA_SIZE); + wksp->mem_size = ZSTD_DStreamWorkspaceBound(wksp->window_size); + wksp->mem = vmalloc(wksp->mem_size); + if (wksp->mem == NULL) + goto failed; + + return wksp; + +failed: + ERROR("Failed to allocate zstd workspace\n"); + kfree(wksp); + return ERR_PTR(-ENOMEM); +} + + +static void zstd_free(void *strm) +{ + struct workspace *wksp = strm; + + if (wksp) + vfree(wksp->mem); + kfree(wksp); +} + + +static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) +{ + struct workspace *wksp = strm; + ZSTD_DStream *stream; + size_t total_out = 0; + size_t zstd_err; + int k = 0; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + + stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size); + + if (!stream) { + ERROR("Failed to initialize zstd decompressor\n"); + goto out; + } + + out_buf.size = PAGE_SIZE; + out_buf.dst = squashfs_first_page(output); + + do { + if (in_buf.pos == in_buf.size && k < b) { + int avail = min(length, msblk->devblksize - offset); + + length -= avail; + in_buf.src = bh[k]->b_data + offset; + in_buf.size = avail; + in_buf.pos = 0; + offset = 0; + } + + if (out_buf.pos == out_buf.size) { + out_buf.dst = squashfs_next_page(output); + if (out_buf.dst == NULL) { + /* Shouldn't run out of pages + * before stream is done. + */ + squashfs_finish_page(output); + goto out; + } + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + } + + total_out -= out_buf.pos; + zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf); + total_out += out_buf.pos; /* add the additional data produced */ + + if (in_buf.pos == in_buf.size && k < b) + put_bh(bh[k++]); + } while (zstd_err != 0 && !ZSTD_isError(zstd_err)); + + squashfs_finish_page(output); + + if (ZSTD_isError(zstd_err)) { + ERROR("zstd decompression error: %d\n", + (int)ZSTD_getErrorCode(zstd_err)); + goto out; + } + + if (k < b) + goto out; + + return (int)total_out; + +out: + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_zstd_comp_ops = { + .init = zstd_init, + .free = zstd_free, + .decompress = zstd_uncompress, + .id = ZSTD_COMPRESSION, + .name = "zstd", + .supported = 1 +}; diff --git a/fs/super.c b/fs/super.c index 221cfa1f4e92..166c4ee0d0ed 100644 --- a/fs/super.c +++ b/fs/super.c @@ -360,7 +360,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock) s->s_count++; spin_unlock(&sb_lock); down_write(&s->s_umount); - if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) { + if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } @@ -390,7 +390,7 @@ bool trylock_super(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!hlist_unhashed(&sb->s_instances) && - sb->s_root && (sb->s_flags & MS_BORN)) + sb->s_root && (sb->s_flags & SB_BORN)) return true; up_read(&sb->s_umount); } @@ -419,7 +419,7 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); - sb->s_flags &= ~MS_ACTIVE; + sb->s_flags &= ~SB_ACTIVE; fsnotify_unmount_inodes(sb); cgroup_writeback_umount(); @@ -472,7 +472,7 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; - if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && + if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !(type->fs_flags & FS_USERNS_MOUNT) && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -502,7 +502,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns); + s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -547,11 +547,11 @@ struct super_block *sget(struct file_system_type *type, * mount through to here so always use &init_user_ns * until that changes. */ - if (flags & MS_SUBMOUNT) + if (flags & SB_SUBMOUNT) user_ns = &init_user_ns; /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return sget_userns(type, test, set, flags, user_ns, data); @@ -594,7 +594,7 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (sb->s_root && (sb->s_flags & SB_BORN)) f(sb, arg); up_read(&sb->s_umount); @@ -628,7 +628,7 @@ void iterate_supers_type(struct file_system_type *type, spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (sb->s_root && (sb->s_flags & SB_BORN)) f(sb, arg); up_read(&sb->s_umount); @@ -664,7 +664,7 @@ rescan: else down_write(&sb->s_umount); /* still alive? */ - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; if (!excl) up_read(&sb->s_umount); @@ -785,7 +785,7 @@ rescan: spin_unlock(&sb_lock); down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; up_read(&sb->s_umount); /* nope, got unmounted */ @@ -801,13 +801,13 @@ rescan: /** * do_remount_sb - asks filesystem to change mount options. * @sb: superblock in question - * @flags: numeric part of options + * @sb_flags: revised superblock flags * @data: the rest of options * @force: whether or not to force the change * * Alters the mount options of a mounted file system. */ -int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) { int retval; int remount_ro; @@ -816,11 +816,11 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return -EBUSY; #ifdef CONFIG_BLOCK - if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) + if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) return -EACCES; #endif - remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { @@ -831,7 +831,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; - remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); } } shrink_dcache_sb(sb); @@ -850,7 +850,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } if (sb->s_op->remount_fs) { - retval = sb->s_op->remount_fs(sb, &flags, data); + retval = sb->s_op->remount_fs(sb, &sb_flags, data); if (retval) { if (!force) goto cancel_readonly; @@ -859,7 +859,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) sb->s_type->name, retval); } } - sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK); /* Needs to be ordered wrt mnt_is_readonly() */ smp_wmb(); sb->s_readonly_remount = 0; @@ -892,12 +892,12 @@ static void do_emergency_remount(struct work_struct *work) sb->s_count++; spin_unlock(&sb_lock); down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) && - !(sb->s_flags & MS_RDONLY)) { + if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && + !sb_rdonly(sb)) { /* * What lock protects sb->s_flags?? */ - do_remount_sb(sb, MS_RDONLY, NULL, 1); + do_remount_sb(sb, SB_RDONLY, NULL, 1); } up_write(&sb->s_umount); spin_lock(&sb_lock); @@ -1023,7 +1023,7 @@ struct dentry *mount_ns(struct file_system_type *fs_type, /* Don't allow mounting unless the caller has CAP_SYS_ADMIN * over the namespace. */ - if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & SB_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, @@ -1033,13 +1033,13 @@ struct dentry *mount_ns(struct file_system_type *fs_type, if (!sb->s_root) { int err; - err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + err = fill_super(sb, data, flags & SB_SILENT ? 1 : 0); if (err) { deactivate_locked_super(sb); return ERR_PTR(err); } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; } return dget(sb->s_root); @@ -1071,7 +1071,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1089,14 +1089,14 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error = -EBUSY; goto error_bdev; } - s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC, + s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC, bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; if (s->s_root) { - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); error = -EBUSY; goto error_bdev; @@ -1116,13 +1116,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); goto error; } - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; bdev->bd_super = s; } @@ -1162,12 +1162,12 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, if (IS_ERR(s)) return ERR_CAST(s); - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); @@ -1188,12 +1188,12 @@ struct dentry *mount_single(struct file_system_type *fs_type, if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } else { do_remount_sb(s, flags, data, 0); } @@ -1227,7 +1227,7 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); - sb->s_flags |= MS_BORN; + sb->s_flags |= SB_BORN; error = security_sb_kern_mount(sb, flags, secdata); if (error) @@ -1434,12 +1434,12 @@ int freeze_super(struct super_block *sb) return -EBUSY; } - if (!(sb->s_flags & MS_BORN)) { + if (!(sb->s_flags & SB_BORN)) { up_write(&sb->s_umount); return 0; /* sic - it's "nothing to do" */ } - if (sb->s_flags & MS_RDONLY) { + if (sb_rdonly(sb)) { /* Nothing to do really... */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); @@ -1502,7 +1502,7 @@ int thaw_super(struct super_block *sb) return -EINVAL; } - if (sb->s_flags & MS_RDONLY) { + if (sb_rdonly(sb)) { sb->s_writers.frozen = SB_UNFROZEN; goto out; } diff --git a/fs/sync.c b/fs/sync.c index 2e3fd7d94d2d..a576aa2e6b09 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -57,7 +57,7 @@ int sync_filesystem(struct super_block *sb) /* * No point in syncing out anything if the filesystem is read-only. */ - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return 0; ret = __sync_filesystem(sb, 0); @@ -69,13 +69,13 @@ EXPORT_SYMBOL(sync_filesystem); static void sync_inodes_one_sb(struct super_block *sb, void *arg) { - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) sync_inodes_sb(sb); } static void sync_fs_one_sb(struct super_block *sb, void *arg) { - if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs) + if (!sb_rdonly(sb) && sb->s_op->sync_fs) sb->s_op->sync_fs(sb, *(int *)arg); } diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c index 921c053fc052..862c1f74a583 100644 --- a/fs/sysv/balloc.c +++ b/fs/sysv/balloc.c @@ -231,7 +231,7 @@ trust_sb: Ecount: printk("sysv_count_free_blocks: free block count was %d, " "correcting to %d\n", sb_count, count); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { *sbi->s_free_blocks = cpu_to_fs32(sbi, count); dirty_sb(sb); } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 53f1b78996dd..eb963fbb7903 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -220,7 +220,7 @@ Einval: printk("sysv_count_free_inodes: " "free inode count was %d, correcting to %d\n", sb_count, count); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count); dirty_sb(sb); } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 858fb72f9e0f..1c8bf9453a71 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -70,7 +70,7 @@ static void sysv_put_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { /* XXX ext2 also updates the state here */ mark_buffer_dirty(sbi->s_bh1); if (sbi->s_bh1 != sbi->s_bh2) diff --git a/fs/sysv/super.c b/fs/sysv/super.c index eda10959714f..0d56e486b392 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -216,7 +216,7 @@ static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh) if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) { sbi->s_type = FSTYPE_AFS; sbi->s_forced_ro = 1; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { printk("SysV FS: SCO EAFS on %s detected, " "forcing read-only mode.\n", sb->s_id); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index bffadbb67e47..5496b17b959c 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1159,7 +1159,7 @@ static int mount_ubifs(struct ubifs_info *c) long long x, y; size_t sz; - c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); + c->ro_mount = !!sb_rdonly(c->vfs_sb); /* Suppress error messages while probing if MS_SILENT is set */ c->probing = !!(c->vfs_sb->s_flags & MS_SILENT); diff --git a/fs/udf/super.c b/fs/udf/super.c index 93c59630512b..99cb81d0077f 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -673,7 +673,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sbi->s_dmode = uopt.dmode; write_unlock(&sbi->s_cred_lock); - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) goto out_unlock; if (*flags & MS_RDONLY) @@ -1017,7 +1017,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition, fe = udf_iget_special(sb, &addr); if (IS_ERR(fe)) { - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); @@ -1341,7 +1341,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) * writing to it (we overwrite blocks instead of relocating * them). */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { ret = -EACCES; goto out_bh; } @@ -2205,7 +2205,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = -EINVAL; goto error_out; } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION && - !(sb->s_flags & MS_RDONLY)) { + !sb_rdonly(sb)) { ret = -EACCES; goto error_out; } @@ -2226,7 +2226,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & UDF_PART_FLAG_READ_ONLY && - !(sb->s_flags & MS_RDONLY)) { + !sb_rdonly(sb)) { ret = -EACCES; goto error_out; } @@ -2245,7 +2245,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) le16_to_cpu(ts.year), ts.month, ts.day, ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); } - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { udf_open_lvid(sb); lvid_open = true; } @@ -2332,7 +2332,7 @@ static void udf_put_super(struct super_block *sb) if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); #endif - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); udf_sb_free_partitions(sb); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 0a4f58a5073c..6440003f8ddc 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -278,7 +278,7 @@ void ufs_error (struct super_block * sb, const char * function, uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); @@ -312,7 +312,7 @@ void ufs_panic (struct super_block * sb, const char * function, uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); @@ -742,7 +742,7 @@ static void ufs_put_super(struct super_block *sb) UFSD("ENTER\n"); - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) ufs_put_super_internal(sb); cancel_delayed_work_sync(&sbi->sync_work); @@ -793,7 +793,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("ENTER\n"); #ifndef CONFIG_UFS_FS_WRITE - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); return -EROFS; } @@ -805,7 +805,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->sb = sb; - UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); + UFSD("flag %u\n", (int)(sb_rdonly(sb))); mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); @@ -902,7 +902,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=old is supported read-only\n"); sb->s_flags |= MS_RDONLY; @@ -918,7 +918,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_sbbase = 0; uspi->s_dirblksize = 1024; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; @@ -934,7 +934,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_sbbase = 0; uspi->s_dirblksize = 1024; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep-cd is supported read-only\n"); sb->s_flags |= MS_RDONLY; @@ -950,7 +950,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_sbbase = 0; uspi->s_dirblksize = 1024; flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=openstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; @@ -965,7 +965,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=hp is supported read-only\n"); sb->s_flags |= MS_RDONLY; @@ -1273,7 +1273,7 @@ magic_found: /* * Read cylinder group structures */ - if (!(sb->s_flags & MS_RDONLY)) + if (!sb_rdonly(sb)) if (!ufs_read_cylinder_structures(sb)) goto failed; @@ -1328,7 +1328,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) return -EINVAL; } - if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + if ((bool)(*mount_flags & MS_RDONLY) == sb_rdonly(sb)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; diff --git a/fs/utimes.c b/fs/utimes.c index 6571d8c848a0..51edb9f9507c 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -22,7 +22,7 @@ */ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (times) { if (get_user(tv[0].tv_sec, ×->actime) || @@ -44,7 +44,7 @@ static bool nsec_valid(long nsec) return nsec >= 0 && nsec <= 999999999; } -static int utimes_common(const struct path *path, struct timespec *times) +static int utimes_common(const struct path *path, struct timespec64 *times) { int error; struct iattr newattrs; @@ -115,7 +115,7 @@ out: * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ -long do_utimes(int dfd, const char __user *filename, struct timespec *times, +long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, int flags) { int error = -EINVAL; @@ -167,10 +167,11 @@ out: SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, struct timespec __user *, utimes, int, flags) { - struct timespec tstimes[2]; + struct timespec64 tstimes[2]; if (utimes) { - if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) + if ((get_timespec64(&tstimes[0], &utimes[0]) || + get_timespec64(&tstimes[1], &utimes[1]))) return -EFAULT; /* Nothing to do, we must not even check the path. */ @@ -186,7 +187,7 @@ SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, struct timeval __user *, utimes) { struct timeval times[2]; - struct timespec tstimes[2]; + struct timespec64 tstimes[2]; if (utimes) { if (copy_from_user(×, utimes, sizeof(times))) @@ -224,7 +225,7 @@ SYSCALL_DEFINE2(utimes, char __user *, filename, COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, struct compat_utimbuf __user *, t) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t->actime) || @@ -238,11 +239,11 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (t) { - if (compat_get_timespec(&tv[0], &t[0]) || - compat_get_timespec(&tv[1], &t[1])) + if (compat_get_timespec64(&tv[0], &t[0]) || + compat_get_timespec64(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) @@ -253,7 +254,7 @@ COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filena COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || diff --git a/fs/xattr.c b/fs/xattr.c index 7b03df6b8be2..4424f7fecf14 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -23,6 +23,7 @@ #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> +#include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) @@ -502,10 +503,10 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file(f.file); + error = mnt_want_write_file_path(f.file); if (!error) { error = setxattr(f.file->f_path.dentry, name, value, size, flags); - mnt_drop_write_file(f.file); + mnt_drop_write_file_path(f.file); } fdput(f); return error; @@ -734,10 +735,10 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file(f.file); + error = mnt_want_write_file_path(f.file); if (!error) { error = removexattr(f.file->f_path.dentry, name); - mnt_drop_write_file(f.file); + mnt_drop_write_file_path(f.file); } fdput(f); return error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ec3e44fcf771..ebdd0bd2b261 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -259,7 +259,11 @@ xfs_file_buffered_aio_read( trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = generic_file_read_iter(iocb, to); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -636,6 +640,9 @@ xfs_file_buffered_aio_write( int enospc = 0; int iolock; + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + write_retry: iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); @@ -912,7 +919,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_AIO_NOWAIT; + file->f_mode |= FMODE_NOWAIT; return 0; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9301c5a6060b..dcd1292664b3 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -270,7 +270,14 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) #endif /* DEBUG */ #ifdef CONFIG_XFS_RT -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) + +/* + * make sure we ignore the inode flag if the filesystem doesn't have a + * configured realtime device. + */ +#define XFS_IS_REALTIME_INODE(ip) \ + (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \ + (ip)->i_mount->m_rtdev_targp) #else #define XFS_IS_REALTIME_INODE(ip) (0) #endif diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index de9493253edf..a65108594a07 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -125,7 +125,7 @@ xfs_fs_set_info( struct xfs_mount *mp = XFS_M(sb); struct qc_dqblk newlim; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return -EROFS; if (!XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; @@ -175,7 +175,7 @@ xfs_quota_enable( { struct xfs_mount *mp = XFS_M(sb); - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return -EROFS; if (!XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; @@ -190,7 +190,7 @@ xfs_quota_disable( { struct xfs_mount *mp = XFS_M(sb); - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return -EROFS; if (!XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; @@ -208,7 +208,7 @@ xfs_fs_rm_xquota( struct xfs_mount *mp = XFS_M(sb); unsigned int flags = 0; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return -EROFS; if (XFS_IS_QUOTA_ON(mp)) @@ -279,7 +279,7 @@ xfs_fs_set_dqblk( { struct xfs_mount *mp = XFS_M(sb); - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return -EROFS; if (!XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3008f31753df..c996f4ae4a5f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -210,7 +210,7 @@ xfs_parseargs( /* * Copy binary VFS mount flags we are interested in. */ - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) mp->m_flags |= XFS_MOUNT_RDONLY; if (sb->s_flags & MS_DIRSYNC) mp->m_flags |= XFS_MOUNT_DIRSYNC; |