diff options
Diffstat (limited to 'fs')
597 files changed, 13960 insertions, 11390 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 9da967f38387..eb3589edf485 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -93,7 +93,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) * instantiating the inode (v9fs_inode_from_fid) */ acl = get_cached_acl(inode, type); - BUG_ON(acl == ACL_NOT_CACHED); + BUG_ON(is_uncached_acl(acl)); return acl; } @@ -213,8 +213,8 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, } static int v9fs_xattr_get_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) { struct v9fs_session_info *v9ses; struct posix_acl *acl; @@ -227,7 +227,7 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler, if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) return v9fs_xattr_get(dentry, handler->name, buffer, size); - acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags); + acl = v9fs_get_cached_acl(inode, handler->flags); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index e9e04376c52c..c37fb9c08970 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -153,7 +153,7 @@ static void v9fs_invalidate_page(struct page *page, unsigned int offset, * If called with zero offset, we should release * the private state assocated with the page */ - if (offset == 0 && length == PAGE_CACHE_SIZE) + if (offset == 0 && length == PAGE_SIZE) v9fs_fscache_invalidate_page(page); } @@ -166,10 +166,10 @@ static int v9fs_vfs_writepage_locked(struct page *page) struct bio_vec bvec; int err, len; - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; + if (page->index == size >> PAGE_SHIFT) + len = size & ~PAGE_MASK; else - len = PAGE_CACHE_SIZE; + len = PAGE_SIZE; bvec.bv_page = page; bvec.bv_offset = 0; @@ -245,9 +245,10 @@ static int v9fs_launder_page(struct page *page) * */ static ssize_t -v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; + loff_t pos = iocb->ki_pos; ssize_t n; int err = 0; if (iov_iter_rw(iter) == WRITE) { @@ -271,7 +272,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, int retval = 0; struct page *page; struct v9fs_inode *v9inode; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; struct inode *inode = mapping->host; @@ -288,11 +289,11 @@ start: if (PageUptodate(page)) goto out; - if (len == PAGE_CACHE_SIZE) + if (len == PAGE_SIZE) goto out; retval = v9fs_fid_readpage(v9inode->writeback_fid, page); - page_cache_release(page); + put_page(page); if (!retval) goto start; out: @@ -313,7 +314,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, /* * zero out the rest of the area */ - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned from = pos & (PAGE_SIZE - 1); zero_user(page, from + copied, len - copied); flush_dcache_page(page); @@ -331,7 +332,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, } set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); return copied; } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 5cc00e56206e..b0405d6aac85 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -246,7 +246,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) const struct file_operations v9fs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .iterate = v9fs_dir_readdir, + .iterate_shared = v9fs_dir_readdir, .open = v9fs_file_open, .release = v9fs_dir_release, }; @@ -254,7 +254,7 @@ const struct file_operations v9fs_dir_operations = { const struct file_operations v9fs_dir_operations_dotl = { .read = generic_read_dir, .llseek = generic_file_llseek, - .iterate = v9fs_dir_readdir_dotl, + .iterate_shared = v9fs_dir_readdir_dotl, .open = v9fs_file_open, .release = v9fs_dir_release, .fsync = v9fs_file_fsync_dotl, diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index eadc894faea2..b84c291ba1eb 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -421,8 +421,8 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); loff_t i_size; unsigned long pg_start, pg_end; - pg_start = origin >> PAGE_CACHE_SHIFT; - pg_end = (origin + retval - 1) >> PAGE_CACHE_SHIFT; + pg_start = origin >> PAGE_SHIFT; + pg_end = (origin + retval - 1) >> PAGE_SHIFT; if (inode->i_mapping && inode->i_mapping->nrpages) invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 3a08b3e6ff1d..f4645c515262 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1071,7 +1071,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, d_inode(dentry), d_inode(dentry)->i_sb); + v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb); generic_fillattr(d_inode(dentry), stat); p9stat_free(st); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index bf495cedec26..de3ed8629196 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -87,7 +87,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_op = &v9fs_super_ops; sb->s_bdi = &v9ses->bdi; if (v9ses->cache) - sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE; + sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; if (!v9ses->cache) diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 9dd9b47a6c1a..18c62bae9591 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -138,8 +138,8 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } static int v9fs_xattr_handler_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) { const char *full_name = xattr_full_name(handler, name); diff --git a/fs/Kconfig b/fs/Kconfig index 6725f59c18e6..b8fcb416be72 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -52,6 +52,7 @@ config FS_DAX_PMD depends on FS_DAX depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE + depends on BROKEN endif # BLOCK diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 2d0cbbd14cfc..72c03354c14b 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -1,6 +1,7 @@ config BINFMT_ELF bool "Kernel support for ELF binaries" depends on MMU && (BROKEN || !FRV) + select ELFCORE default y ---help--- ELF (Executable and Linkable Format) is a format for libraries and @@ -26,6 +27,7 @@ config BINFMT_ELF config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF + select ELFCORE config ARCH_BINFMT_ELF_STATE bool @@ -34,6 +36,7 @@ config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) + select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load segments of a binary to be located in memory independently of each @@ -43,6 +46,11 @@ config BINFMT_ELF_FDPIC It is also possible to run FDPIC ELF binaries on MMU linux also. +config ELFCORE + bool + help + This option enables kernel/elfcore.o. + config CORE_DUMP_DEFAULT_ELF_HEADERS bool "Write ELF core dumps with partial segments" default y diff --git a/fs/affs/dir.c b/fs/affs/dir.c index ac4f318aafba..f1e7294381c5 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -20,7 +20,7 @@ static int affs_readdir(struct file *, struct dir_context *); const struct file_operations affs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .iterate = affs_readdir, + .iterate_shared = affs_readdir, .fsync = affs_file_fsync, }; diff --git a/fs/affs/file.c b/fs/affs/file.c index 22fc7c802d69..0deec9cc2362 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -389,12 +389,13 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) } static ssize_t -affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; ssize_t ret; if (iov_iter_rw(iter) == WRITE) { @@ -404,7 +405,7 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) return 0; } - ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, affs_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) affs_write_failed(mapping, offset + count); return ret; @@ -510,9 +511,9 @@ affs_do_readpage_ofs(struct page *page, unsigned to) pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, page->index, to); - BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_SIZE); bsize = AFFS_SB(sb)->s_data_blksize; - tmp = page->index << PAGE_CACHE_SHIFT; + tmp = page->index << PAGE_SHIFT; bidx = tmp / bsize; boff = tmp % bsize; @@ -613,10 +614,10 @@ affs_readpage_ofs(struct file *file, struct page *page) int err; pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index); - to = PAGE_CACHE_SIZE; - if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) { - to = inode->i_size & ~PAGE_CACHE_MASK; - memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to); + to = PAGE_SIZE; + if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) { + to = inode->i_size & ~PAGE_MASK; + memset(page_address(page) + to, 0, PAGE_SIZE - to); } err = affs_do_readpage_ofs(page, to); @@ -646,7 +647,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return err; } - index = pos >> PAGE_CACHE_SHIFT; + index = pos >> PAGE_SHIFT; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -656,10 +657,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE); + err = affs_do_readpage_ofs(page, PAGE_SIZE); if (err) { unlock_page(page); - page_cache_release(page); + put_page(page); } return err; } @@ -677,7 +678,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, u32 tmp; int written; - from = pos & (PAGE_CACHE_SIZE - 1); + from = pos & (PAGE_SIZE - 1); to = pos + len; /* * XXX: not sure if this can handle short copies (len < copied), but @@ -692,7 +693,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, bh = NULL; written = 0; - tmp = (page->index << PAGE_CACHE_SHIFT) + from; + tmp = (page->index << PAGE_SHIFT) + from; bidx = tmp / bsize; boff = tmp % bsize; if (boff) { @@ -788,13 +789,13 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, done: affs_brelse(bh); - tmp = (page->index << PAGE_CACHE_SHIFT) + from; + tmp = (page->index << PAGE_SHIFT) + from; if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; err_first_bh: unlock_page(page); - page_cache_release(page); + put_page(page); return written; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index e10e17788f06..eba541004d90 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, .release = afs_release, - .iterate = afs_readdir, + .iterate_shared = afs_readdir, .lock = afs_lock, .llseek = generic_file_llseek, }; @@ -128,7 +128,7 @@ struct afs_lookup_cookie { /* * check that a directory page is valid */ -static inline void afs_dir_check_page(struct inode *dir, struct page *page) +static inline bool afs_dir_check_page(struct inode *dir, struct page *page) { struct afs_dir_page *dbuf; loff_t latter; @@ -168,11 +168,11 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page) } SetPageChecked(page); - return; + return true; error: - SetPageChecked(page); SetPageError(page); + return false; } /* @@ -181,7 +181,7 @@ error: static inline void afs_dir_put_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } /* @@ -196,10 +196,10 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); if (!IS_ERR(page)) { kmap(page); - if (!PageChecked(page)) - afs_dir_check_page(dir, page); - if (PageError(page)) - goto fail; + if (unlikely(!PageChecked(page))) { + if (PageError(page) || !afs_dir_check_page(dir, page)) + goto fail; + } } return page; diff --git a/fs/afs/file.c b/fs/afs/file.c index 999bc3caec92..6344aee4ac4b 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -164,7 +164,7 @@ int afs_page_filler(void *data, struct page *page) _debug("cache said ENOBUFS"); default: go_on: - offset = page->index << PAGE_CACHE_SHIFT; + offset = page->index << PAGE_SHIFT; len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE); /* read the contents of the file from the server into the @@ -319,7 +319,7 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, BUG_ON(!PageLocked(page)); /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_CACHE_SIZE) { + if (offset == 0 && length == PAGE_SIZE) { #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index ccd0b212e82a..81dd075356b9 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -93,7 +93,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) kunmap(page); out_free: - page_cache_release(page); + put_page(page); out: _leave(" = %d", ret); return ret; @@ -189,7 +189,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) buf = kmap_atomic(page); memcpy(devname, buf, size); kunmap_atomic(buf); - page_cache_release(page); + put_page(page); page = NULL; } @@ -211,7 +211,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) return mnt; error: - page_cache_release(page); + put_page(page); error_no_page: free_page((unsigned long) options); error_no_options: diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index b50642870a43..63cd9f939f19 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -65,6 +65,12 @@ static void afs_async_workfn(struct work_struct *work) call->async_workfn(call); } +static int afs_wait_atomic_t(atomic_t *p) +{ + schedule(); + return 0; +} + /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT @@ -126,13 +132,16 @@ void afs_close_socket(void) { _enter(""); + wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, + TASK_UNINTERRUPTIBLE); + _debug("no outstanding calls"); + sock_release(afs_socket); _debug("dework"); destroy_workqueue(afs_async_calls); ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); - ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0); _leave(""); } @@ -178,8 +187,6 @@ static void afs_free_call(struct afs_call *call) { _debug("DONE %p{%s} [%d]", call, call->type->name, atomic_read(&afs_outstanding_calls)); - if (atomic_dec_return(&afs_outstanding_calls) == -1) - BUG(); ASSERTCMP(call->rxcall, ==, NULL); ASSERT(!work_pending(&call->async_work)); @@ -188,6 +195,9 @@ static void afs_free_call(struct afs_call *call) kfree(call->request); kfree(call); + + if (atomic_dec_and_test(&afs_outstanding_calls)) + wake_up_atomic_t(&afs_outstanding_calls); } /* @@ -420,9 +430,11 @@ error_kill_call: } /* - * handles intercepted messages that were arriving in the socket's Rx queue - * - called with the socket receive queue lock held to ensure message ordering - * - called with softirqs disabled + * Handles intercepted messages that were arriving in the socket's Rx queue. + * + * Called from the AF_RXRPC call processor in waitqueue process context. For + * each call, it is guaranteed this will be called in order of packet to be + * delivered. */ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, struct sk_buff *skb) @@ -513,6 +525,12 @@ static void afs_deliver_to_call(struct afs_call *call) call->state = AFS_CALL_ABORTED; _debug("Rcv ABORT %u -> %d", abort_code, call->error); break; + case RXRPC_SKB_MARK_LOCAL_ABORT: + abort_code = rxrpc_kernel_get_abort_code(skb); + call->error = call->type->abort_to_error(abort_code); + call->state = AFS_CALL_ABORTED; + _debug("Loc ABORT %u -> %d", abort_code, call->error); + break; case RXRPC_SKB_MARK_NET_ERROR: call->error = -rxrpc_kernel_get_error_number(skb); call->state = AFS_CALL_ERROR; diff --git a/fs/afs/super.c b/fs/afs/super.c index 81afefe7d8a6..fbdb022b75a2 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -315,8 +315,8 @@ static int afs_fill_super(struct super_block *sb, _enter(""); /* fill in the superblock */ - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; sb->s_bdi = &as->volume->bdi; diff --git a/fs/afs/write.c b/fs/afs/write.c index dfef94f70667..65de439bdc4f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -93,10 +93,10 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, _enter(",,%llu", (unsigned long long)pos); i_size = i_size_read(&vnode->vfs_inode); - if (pos + PAGE_CACHE_SIZE > i_size) + if (pos + PAGE_SIZE > i_size) len = i_size - pos; else - len = PAGE_CACHE_SIZE; + len = PAGE_SIZE; ret = afs_vnode_fetch_data(vnode, key, pos, len, page); if (ret < 0) { @@ -123,9 +123,9 @@ int afs_write_begin(struct file *file, struct address_space *mapping, struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; struct key *key = file->private_data; - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; int ret; _enter("{%x:%u},{%lx},%u,%u", @@ -151,8 +151,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; /* page won't leak in error case: it eventually gets cleaned off LRU */ - if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { - ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); + if (!PageUptodate(page) && len != PAGE_SIZE) { + ret = afs_fill_page(vnode, key, index << PAGE_SHIFT, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); @@ -266,7 +266,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, if (PageDirty(page)) _debug("dirtied"); unlock_page(page); - page_cache_release(page); + put_page(page); return copied; } @@ -480,7 +480,7 @@ static int afs_writepages_region(struct address_space *mapping, if (page->index > end) { *_next = index; - page_cache_release(page); + put_page(page); _leave(" = 0 [%lx]", *_next); return 0; } @@ -494,7 +494,7 @@ static int afs_writepages_region(struct address_space *mapping, if (page->mapping != mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); continue; } @@ -515,7 +515,7 @@ static int afs_writepages_region(struct address_space *mapping, ret = afs_write_back_from_locked_page(wb, page); unlock_page(page); - page_cache_release(page); + put_page(page); if (ret < 0) { _leave(" = %d", ret); return ret; @@ -551,13 +551,13 @@ int afs_writepages(struct address_space *mapping, &next); mapping->writeback_index = next; } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); + end = (pgoff_t)(LLONG_MAX >> PAGE_SHIFT); ret = afs_writepages_region(mapping, wbc, 0, end, &next); if (wbc->nr_to_write > 0) mapping->writeback_index = next; } else { - start = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + start = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; ret = afs_writepages_region(mapping, wbc, start, end, &next); } @@ -496,7 +496,12 @@ static int aio_setup_ring(struct kioctx *ctx) ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) { + ctx->mmap_size = 0; + aio_free_ring(ctx); + return -EINTR; + } + ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 0, &unused); @@ -1447,8 +1452,6 @@ rw_common: return ret; } - len = ret; - if (rw == WRITE) file_start_write(file); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 7ab923940d18..78bd80298528 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -39,7 +39,7 @@ const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, - .iterate = dcache_readdir, + .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, .unlocked_ioctl = autofs4_root_ioctl, #ifdef CONFIG_COMPAT @@ -51,7 +51,7 @@ const struct file_operations autofs4_dir_operations = { .open = autofs4_dir_open, .release = dcache_dir_close, .read = generic_read_dir, - .iterate = dcache_readdir, + .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, }; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 103f5d7c3083..72e35b721608 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -106,8 +106,8 @@ static int bad_inode_setxattr(struct dentry *dentry, const char *name, return -EIO; } -static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) +static ssize_t bad_inode_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) { return -EIO; } diff --git a/fs/befs/befs.h b/fs/befs/befs.h index 35d19e8731e3..e0f59263a96d 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -116,7 +116,7 @@ BEFS_I(const struct inode *inode) } static inline befs_blocknr_t -iaddr2blockno(struct super_block *sb, befs_inode_addr * iaddr) +iaddr2blockno(struct super_block *sb, const befs_inode_addr *iaddr) { return ((iaddr->allocation_group << BEFS_SB(sb)->ag_shift) + iaddr->start); @@ -141,7 +141,7 @@ befs_iaddrs_per_block(struct super_block *sb) } static inline int -befs_iaddr_is_empty(befs_inode_addr * iaddr) +befs_iaddr_is_empty(const befs_inode_addr *iaddr) { return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len); } diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 22c166280883..307645f9e284 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -88,15 +88,15 @@ struct befs_btree_node { static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; /* local functions */ -static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, +static int befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds, befs_btree_super * bt_super, struct befs_btree_node *this_node, befs_off_t * node_off); -static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, +static int befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds, befs_btree_super * sup); -static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, +static int befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds, struct befs_btree_node *node, befs_off_t node_off); @@ -134,7 +134,7 @@ static int befs_compare_strings(const void *key1, int keylen1, * On failure, BEFS_ERR is returned. */ static int -befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, +befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds, befs_btree_super * sup) { struct buffer_head *bh; @@ -193,7 +193,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, */ static int -befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, +befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds, struct befs_btree_node *node, befs_off_t node_off) { uint off = 0; @@ -247,7 +247,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, * actuall value stored with the key. */ int -befs_btree_find(struct super_block *sb, befs_data_stream * ds, +befs_btree_find(struct super_block *sb, const befs_data_stream *ds, const char *key, befs_off_t * value) { struct befs_btree_node *this_node; @@ -416,7 +416,7 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, * until the (key_no)th key is found or the tree is out of keys. */ int -befs_btree_read(struct super_block *sb, befs_data_stream * ds, +befs_btree_read(struct super_block *sb, const befs_data_stream *ds, loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize, befs_off_t * value) { @@ -548,7 +548,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, * Also checks for an empty tree. If there are no keys, returns BEFS_BT_EMPTY. */ static int -befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, +befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds, befs_btree_super *bt_super, struct befs_btree_node *this_node, befs_off_t * node_off) diff --git a/fs/befs/btree.h b/fs/befs/btree.h index 92e781e5f30e..f2a8f637e9e0 100644 --- a/fs/befs/btree.h +++ b/fs/befs/btree.h @@ -4,10 +4,10 @@ */ -int befs_btree_find(struct super_block *sb, befs_data_stream * ds, +int befs_btree_find(struct super_block *sb, const befs_data_stream *ds, const char *key, befs_off_t * value); -int befs_btree_read(struct super_block *sb, befs_data_stream * ds, +int befs_btree_read(struct super_block *sb, const befs_data_stream *ds, loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize, befs_off_t * value); diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index ebd50718659f..af1bc19b7c85 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -21,16 +21,16 @@ const befs_inode_addr BAD_IADDR = { 0, 0, 0 }; static int befs_find_brun_direct(struct super_block *sb, - befs_data_stream * data, + const befs_data_stream *data, befs_blocknr_t blockno, befs_block_run * run); static int befs_find_brun_indirect(struct super_block *sb, - befs_data_stream * data, + const befs_data_stream *data, befs_blocknr_t blockno, befs_block_run * run); static int befs_find_brun_dblindirect(struct super_block *sb, - befs_data_stream * data, + const befs_data_stream *data, befs_blocknr_t blockno, befs_block_run * run); @@ -45,10 +45,10 @@ static int befs_find_brun_dblindirect(struct super_block *sb, * if you don't need to know offset just set @off = NULL. */ struct buffer_head * -befs_read_datastream(struct super_block *sb, befs_data_stream * ds, +befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, befs_off_t pos, uint * off) { - struct buffer_head *bh = NULL; + struct buffer_head *bh; befs_block_run run; befs_blocknr_t block; /* block coresponding to pos */ @@ -87,7 +87,7 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds, * 2001-11-15 Will Dyson */ int -befs_fblock2brun(struct super_block *sb, befs_data_stream * data, +befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t fblock, befs_block_run * run) { int err; @@ -122,12 +122,12 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data, * Returns the number of bytes read */ size_t -befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, - befs_off_t len) +befs_read_lsymlink(struct super_block *sb, const befs_data_stream *ds, + void *buff, befs_off_t len) { befs_off_t bytes_read = 0; /* bytes readed */ u16 plen; - struct buffer_head *bh = NULL; + struct buffer_head *bh; befs_debug(sb, "---> %s length: %llu", __func__, len); while (bytes_read < len) { @@ -163,7 +163,7 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, */ befs_blocknr_t -befs_count_blocks(struct super_block * sb, befs_data_stream * ds) +befs_count_blocks(struct super_block *sb, const befs_data_stream *ds) { befs_blocknr_t blocks; befs_blocknr_t datablocks; /* File data blocks */ @@ -243,11 +243,11 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds) 2001-11-15 Will Dyson */ static int -befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, +befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, befs_block_run * run) { int i; - befs_block_run *array = data->direct; + const befs_block_run *array = data->direct; befs_blocknr_t sum; befs_blocknr_t max_block = data->max_direct_range >> BEFS_SB(sb)->block_shift; @@ -304,7 +304,8 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, */ static int befs_find_brun_indirect(struct super_block *sb, - befs_data_stream * data, befs_blocknr_t blockno, + const befs_data_stream *data, + befs_blocknr_t blockno, befs_block_run * run) { int i, j; @@ -412,7 +413,8 @@ befs_find_brun_indirect(struct super_block *sb, */ static int befs_find_brun_dblindirect(struct super_block *sb, - befs_data_stream * data, befs_blocknr_t blockno, + const befs_data_stream *data, + befs_blocknr_t blockno, befs_block_run * run) { int dblindir_indx; @@ -427,7 +429,7 @@ befs_find_brun_dblindirect(struct super_block *sb, struct buffer_head *dbl_indir_block; struct buffer_head *indir_block; befs_block_run indir_run; - befs_disk_inode_addr *iaddr_array = NULL; + befs_disk_inode_addr *iaddr_array; struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_blocknr_t indir_start_blk = @@ -486,7 +488,6 @@ befs_find_brun_dblindirect(struct super_block *sb, iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data; indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]); brelse(dbl_indir_block); - iaddr_array = NULL; /* Read indirect block */ which_block = indir_indx / befs_iaddrs_per_block(sb); @@ -511,7 +512,6 @@ befs_find_brun_dblindirect(struct super_block *sb, iaddr_array = (befs_disk_inode_addr *) indir_block->b_data; *run = fsrun_to_cpu(sb, iaddr_array[block_indx]); brelse(indir_block); - iaddr_array = NULL; blockno_at_run_start = indir_start_blk; blockno_at_run_start += diblklen * dblindir_indx; diff --git a/fs/befs/datastream.h b/fs/befs/datastream.h index 45e8a3c98249..91ba8203d83f 100644 --- a/fs/befs/datastream.h +++ b/fs/befs/datastream.h @@ -4,16 +4,17 @@ */ struct buffer_head *befs_read_datastream(struct super_block *sb, - befs_data_stream * ds, befs_off_t pos, - uint * off); + const befs_data_stream *ds, + befs_off_t pos, uint * off); -int befs_fblock2brun(struct super_block *sb, befs_data_stream * data, +int befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t fblock, befs_block_run * run); -size_t befs_read_lsymlink(struct super_block *sb, befs_data_stream * data, +size_t befs_read_lsymlink(struct super_block *sb, const befs_data_stream *data, void *buff, befs_off_t len); -befs_blocknr_t befs_count_blocks(struct super_block *sb, befs_data_stream * ds); +befs_blocknr_t befs_count_blocks(struct super_block *sb, + const befs_data_stream *ds); extern const befs_inode_addr BAD_IADDR; diff --git a/fs/befs/io.c b/fs/befs/io.c index 7a5b4ec21c56..523c8af2d770 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -26,7 +26,7 @@ struct buffer_head * befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) { - struct buffer_head *bh = NULL; + struct buffer_head *bh; befs_blocknr_t block = 0; struct befs_sb_info *befs_sb = BEFS_SB(sb); @@ -63,7 +63,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) struct buffer_head * befs_bread(struct super_block *sb, befs_blocknr_t block) { - struct buffer_head *bh = NULL; + struct buffer_head *bh; befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index cc0e08252913..7da05b159ade 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep; static const struct file_operations befs_dir_operations = { .read = generic_read_dir, - .iterate = befs_readdir, + .iterate_shared = befs_readdir, .llseek = generic_file_llseek, }; @@ -155,9 +155,9 @@ befs_get_block(struct inode *inode, sector_t block, static struct dentry * befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct inode *inode = NULL; + struct inode *inode; struct super_block *sb = dir->i_sb; - befs_data_stream *ds = &BEFS_I(dir)->i_data.ds; + const befs_data_stream *ds = &BEFS_I(dir)->i_data.ds; befs_off_t offset; int ret; int utfnamelen; @@ -207,7 +207,7 @@ befs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; + const befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; befs_off_t value; int result; size_t keysize; @@ -294,10 +294,10 @@ static void init_once(void *foo) static struct inode *befs_iget(struct super_block *sb, unsigned long ino) { - struct buffer_head *bh = NULL; - befs_inode *raw_inode = NULL; + struct buffer_head *bh; + befs_inode *raw_inode; struct befs_sb_info *befs_sb = BEFS_SB(sb); - struct befs_inode_info *befs_ino = NULL; + struct befs_inode_info *befs_ino; struct inode *inode; long ret = -EIO; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 3ec6113146c0..34a5bc2f1290 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -70,7 +70,7 @@ static int bfs_readdir(struct file *f, struct dir_context *ctx) const struct file_operations bfs_dir_operations = { .read = generic_read_dir, - .iterate = bfs_readdir, + .iterate_shared = bfs_readdir, .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 4c556680fa74..2fab9f130e51 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -297,7 +297,10 @@ static int load_aout_binary(struct linux_binprm * bprm) } if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { - vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + if (IS_ERR_VALUE(error)) + return error; + read_code(bprm->file, N_TXTADDR(ex), fd_offset, ex.a_text + ex.a_data); goto beyond_if; @@ -378,8 +381,10 @@ static int load_aout_library(struct file *file) "N_TXTOFF is not page aligned. Please convert library: %pD\n", file); } - vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - + retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + if (IS_ERR_VALUE(retval)) + goto out; + read_code(file, start_addr, N_TXTOFF(ex), ex.a_text + ex.a_data); retval = 0; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7d914c67a9d0..938fc4ede764 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1176,8 +1176,11 @@ static int load_elf_library(struct file *file) len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1); bss = eppnt->p_memsz + eppnt->p_vaddr; - if (bss > len) - vm_brk(len, bss - len); + if (bss > len) { + error = vm_brk(len, bss - len); + if (BAD_ADDR(error)) + goto out_free_ph; + } error = 0; out_free_ph: @@ -2273,7 +2276,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Align to page */ - if (!dump_skip(cprm, dataoff - cprm->written)) + if (!dump_skip(cprm, dataoff - cprm->file->f_pos)) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; @@ -2292,7 +2295,7 @@ static int elf_core_dump(struct coredump_params *cprm) void *kaddr = kmap(page); stop = !dump_emit(cprm, kaddr, PAGE_SIZE); kunmap(page); - page_cache_release(page); + put_page(page); } else stop = !dump_skip(cprm, PAGE_SIZE); if (stop) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index b1adb92e69de..71ade0e556b7 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1533,7 +1533,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm) void *kaddr = kmap(page); res = dump_emit(cprm, kaddr, PAGE_SIZE); kunmap(page); - page_cache_release(page); + put_page(page); } else { res = dump_skip(cprm, PAGE_SIZE); } @@ -1787,7 +1787,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; } - if (!dump_skip(cprm, dataoff - cprm->written)) + if (!dump_skip(cprm, dataoff - cprm->file->f_pos)) goto end_coredump; if (!elf_fdpic_dump_segments(cprm)) diff --git a/fs/block_dev.c b/fs/block_dev.c index 3172c4e2f502..71ccab1d22c6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -29,6 +29,7 @@ #include <linux/log2.h> #include <linux/cleancache.h> #include <linux/dax.h> +#include <linux/badblocks.h> #include <asm/uaccess.h> #include "internal.h" @@ -50,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); +void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); +} + static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; @@ -162,15 +175,15 @@ static struct inode *bdev_file_inode(struct file *file) } static ssize_t -blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); if (IS_DAX(inode)) - return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, + return dax_do_io(iocb, inode, iter, blkdev_get_block, NULL, DIO_SKIP_DIO_COUNT); - return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, + return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); } @@ -331,7 +344,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); unlock_page(page); - page_cache_release(page); + put_page(page); return ret; } @@ -488,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); + avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); if (!avail) return -ERANGE; if (avail > 0 && avail & ~PAGE_MASK) @@ -497,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) } EXPORT_SYMBOL_GPL(bdev_direct_access); +/** + * bdev_dax_supported() - Check if the device supports dax for filesystem + * @sb: The superblock of the device + * @blocksize: The block size of the device + * + * This is a library function for filesystems to check if the block device + * can be mounted with dax option. + * + * Return: negative errno if unsupported, 0 if supported. + */ +int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + struct blk_dax_ctl dax = { + .sector = 0, + .size = PAGE_SIZE, + }; + int err; + + if (blocksize != PAGE_SIZE) { + vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); + return -EINVAL; + } + + err = bdev_direct_access(sb->s_bdev, &dax); + if (err < 0) { + switch (err) { + case -EOPNOTSUPP: + vfs_msg(sb, KERN_ERR, + "error: device does not support dax"); + break; + case -EINVAL: + vfs_msg(sb, KERN_ERR, + "error: unaligned partition for dax"); + break; + default: + vfs_msg(sb, KERN_ERR, + "error: dax access failed (%d)", err); + } + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(bdev_dax_supported); + +/** + * bdev_dax_capable() - Return if the raw device is capable for dax + * @bdev: The device for raw block device access + */ +bool bdev_dax_capable(struct block_device *bdev) +{ + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + }; + + if (!IS_ENABLED(CONFIG_FS_DAX)) + return false; + + dax.sector = 0; + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + return true; +} + /* * pseudo-fs */ @@ -1149,7 +1231,7 @@ void bd_set_size(struct block_device *bdev, loff_t size) inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); inode_unlock(bdev->bd_inode); - while (bsize < PAGE_CACHE_SIZE) { + while (bsize < PAGE_SIZE) { if (size & bsize) break; bsize <<= 1; @@ -1238,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } @@ -1275,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { @@ -1660,12 +1742,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); - if (ret > 0) { - ssize_t err; - err = generic_write_sync(file, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; - } + if (ret > 0) + ret = generic_write_sync(iocb, ret); blk_finish_plug(&plug); return ret; } @@ -1724,79 +1802,13 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; -#ifdef CONFIG_FS_DAX -/* - * In the raw block case we do not need to contend with truncation nor - * unwritten file extents. Without those concerns there is no need for - * additional locking beyond the mmap_sem context that these routines - * are already executing under. - * - * Note, there is no protection if the block device is dynamically - * resized (partition grow/shrink) during a fault. A stable block device - * size is already not enforced in the blkdev_direct_IO path. - * - * For DAX, it is the responsibility of the block device driver to - * ensure the whole-disk device size is stable while requests are in - * flight. - * - * Finally, unlike the filemap_page_mkwrite() case there is no - * filesystem superblock to sync against freezing. We still include a - * pfn_mkwrite callback for dax drivers to receive write fault - * notifications. - */ -static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - return __dax_fault(vma, vmf, blkdev_get_block, NULL); -} - -static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - return dax_pfn_mkwrite(vma, vmf); -} - -static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) -{ - return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); -} - -static const struct vm_operations_struct blkdev_dax_vm_ops = { - .fault = blkdev_dax_fault, - .pmd_fault = blkdev_dax_pmd_fault, - .pfn_mkwrite = blkdev_dax_pfn_mkwrite, -}; - -static const struct vm_operations_struct blkdev_default_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, -}; - -static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(file); - - file_accessed(file); - if (IS_DAX(bd_inode)) { - vma->vm_ops = &blkdev_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; - } else { - vma->vm_ops = &blkdev_default_vm_ops; - } - - return 0; -} -#else -#define blkdev_mmap generic_file_mmap -#endif - const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = blkdev_mmap, + .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6d263bb1621c..67a607709d4f 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -63,9 +63,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) } kfree(value); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - return acl; } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 80e8472d618b..d3090187fd76 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1991,7 +1991,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, ifp = kmalloc(sizeof(*ifp), GFP_NOFS); if (!ifp) { - kfree(fspath); + vfree(fspath); return ERR_PTR(-ENOMEM); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 61205e3bbefa..1da5753d886d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -196,6 +196,16 @@ struct btrfs_inode { struct list_head delayed_iput; long delayed_iput_count; + /* + * To avoid races between lockless (i_mutex not held) direct IO writes + * and concurrent fsync requests. Direct IO writes must acquire read + * access on this semaphore for creating an extent map and its + * corresponding ordered extent. The fast fsync path must acquire write + * access on this semaphore before it collects ordered extents and + * extent maps. + */ + struct rw_semaphore dio_sem; + struct inode vfs_inode; }; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e34a71b3e225..516e19d1d202 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -757,7 +757,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, BUG_ON(NULL == l); ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)PAGE_CACHE_SIZE) { + if (ret < (int)PAGE_SIZE) { printk(KERN_INFO "btrfsic: read @logical %llu failed!\n", tmp_next_block_ctx.start); @@ -1231,15 +1231,15 @@ static void btrfsic_read_from_block_data( size_t offset_in_page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; + size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + offset) >> PAGE_SHIFT; WARN_ON(offset + len > block_ctx->len); - offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1); + offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1); while (len > 0) { - cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); - BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE)); + cur = min(len, ((size_t)PAGE_SIZE - offset_in_page)); + BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); kaddr = block_ctx->datav[i]; memcpy(dst, kaddr + offset_in_page, cur); @@ -1605,8 +1605,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) BUG_ON(!block_ctx->datav); BUG_ON(!block_ctx->pagev); - num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> + PAGE_SHIFT; while (num_pages > 0) { num_pages--; if (block_ctx->datav[num_pages]) { @@ -1637,15 +1637,15 @@ static int btrfsic_read_block(struct btrfsic_state *state, BUG_ON(block_ctx->datav); BUG_ON(block_ctx->pagev); BUG_ON(block_ctx->mem_to_free); - if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { + if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) { printk(KERN_INFO "btrfsic: read_block() with unaligned bytenr %llu\n", block_ctx->dev_bytenr); return -1; } - num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> + PAGE_SHIFT; block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev)) * num_pages, GFP_NOFS); @@ -1676,8 +1676,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], - PAGE_CACHE_SIZE, 0); - if (PAGE_CACHE_SIZE != ret) + PAGE_SIZE, 0); + if (PAGE_SIZE != ret) break; } if (j == i) { @@ -1693,7 +1693,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -1; } bio_put(bio); - dev_bytenr += (j - i) * PAGE_CACHE_SIZE; + dev_bytenr += (j - i) * PAGE_SIZE; i = j; } for (i = 0; i < num_pages; i++) { @@ -1769,9 +1769,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, u32 crc = ~(u32)0; unsigned int i; - if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) + if (num_pages * PAGE_SIZE < state->metablock_size) return 1; /* not metadata */ - num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; + num_pages = state->metablock_size >> PAGE_SHIFT; h = (struct btrfs_header *)datav[0]; if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1779,8 +1779,8 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, for (i = 0; i < num_pages; i++) { u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); - size_t sublen = i ? PAGE_CACHE_SIZE : - (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); + size_t sublen = i ? PAGE_SIZE : + (PAGE_SIZE - BTRFS_CSUM_SIZE); crc = btrfs_crc32c(crc, data, sublen); } @@ -1826,14 +1826,14 @@ again: if (block->is_superblock) { bytenr = btrfs_super_bytenr((struct btrfs_super_block *) mapped_datav[0]); - if (num_pages * PAGE_CACHE_SIZE < + if (num_pages * PAGE_SIZE < BTRFS_SUPER_INFO_SIZE) { printk(KERN_INFO "btrfsic: cannot work with too short bios!\n"); return; } is_metadata = 1; - BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); + BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1)); processed_len = BTRFS_SUPER_INFO_SIZE; if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { @@ -1844,7 +1844,7 @@ again: } if (is_metadata) { if (!block->is_superblock) { - if (num_pages * PAGE_CACHE_SIZE < + if (num_pages * PAGE_SIZE < state->metablock_size) { printk(KERN_INFO "btrfsic: cannot work with too short bios!\n"); @@ -1880,7 +1880,7 @@ again: } block->logical_bytenr = bytenr; } else { - if (num_pages * PAGE_CACHE_SIZE < + if (num_pages * PAGE_SIZE < state->datablock_size) { printk(KERN_INFO "btrfsic: cannot work with too short bios!\n"); @@ -2013,7 +2013,7 @@ again: block->logical_bytenr = bytenr; block->is_metadata = 1; if (block->is_superblock) { - BUG_ON(PAGE_CACHE_SIZE != + BUG_ON(PAGE_SIZE != BTRFS_SUPER_INFO_SIZE); ret = btrfsic_process_written_superblock( state, @@ -2172,8 +2172,8 @@ again: continue_loop: BUG_ON(!processed_len); dev_bytenr += processed_len; - mapped_datav += processed_len >> PAGE_CACHE_SHIFT; - num_pages -= processed_len >> PAGE_CACHE_SHIFT; + mapped_datav += processed_len >> PAGE_SHIFT; + num_pages -= processed_len >> PAGE_SHIFT; goto again; } @@ -2954,7 +2954,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) goto leave; cur_bytenr = dev_bytenr; for (i = 0; i < bio->bi_vcnt; i++) { - BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); + BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE); mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); if (!mapped_datav[i]) { while (i > 0) { @@ -3037,16 +3037,16 @@ int btrfsic_mount(struct btrfs_root *root, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { + if (root->nodesize & ((u64)PAGE_SIZE - 1)) { printk(KERN_INFO - "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->nodesize, PAGE_CACHE_SIZE); + "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", + root->nodesize, PAGE_SIZE); return -1; } - if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { + if (root->sectorsize & ((u64)PAGE_SIZE - 1)) { printk(KERN_INFO - "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->sectorsize, PAGE_CACHE_SIZE); + "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", + root->sectorsize, PAGE_SIZE); return -1; } state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 3346cd8f9910..658c39b70fba 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -119,7 +119,7 @@ static int check_compressed_csum(struct inode *inode, csum = ~(u32)0; kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE); + csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); btrfs_csum_final(csum, (char *)&csum); kunmap_atomic(kaddr); @@ -190,7 +190,7 @@ csum_failed: for (index = 0; index < cb->nr_pages; index++) { page = cb->compressed_pages[index]; page->mapping = NULL; - page_cache_release(page); + put_page(page); } /* do io completion on the original bio */ @@ -224,8 +224,8 @@ out: static noinline void end_compressed_writeback(struct inode *inode, const struct compressed_bio *cb) { - unsigned long index = cb->start >> PAGE_CACHE_SHIFT; - unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT; + unsigned long index = cb->start >> PAGE_SHIFT; + unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; int i; @@ -247,7 +247,7 @@ static noinline void end_compressed_writeback(struct inode *inode, if (cb->errors) SetPageError(pages[i]); end_page_writeback(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } nr_pages -= ret; index += ret; @@ -304,7 +304,7 @@ static void end_compressed_bio_write(struct bio *bio) for (index = 0; index < cb->nr_pages; index++) { page = cb->compressed_pages[index]; page->mapping = NULL; - page_cache_release(page); + put_page(page); } /* finally free the cb struct */ @@ -341,7 +341,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, int ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + WARN_ON(start & ((u64)PAGE_SIZE - 1)); cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); if (!cb) return -ENOMEM; @@ -374,14 +374,14 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, - PAGE_CACHE_SIZE, + PAGE_SIZE, bio, 0); else ret = 0; page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) < + PAGE_SIZE) { bio_get(bio); /* @@ -410,15 +410,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(!bio); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; - bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + bio_add_page(bio, page, PAGE_SIZE, 0); } - if (bytes_left < PAGE_CACHE_SIZE) { + if (bytes_left < PAGE_SIZE) { btrfs_info(BTRFS_I(inode)->root->fs_info, "bytes left %lu compress len %lu nr %lu", bytes_left, cb->compressed_len, cb->nr_pages); } - bytes_left -= PAGE_CACHE_SIZE; - first_byte += PAGE_CACHE_SIZE; + bytes_left -= PAGE_SIZE; + first_byte += PAGE_SIZE; cond_resched(); } bio_get(bio); @@ -457,17 +457,17 @@ static noinline int add_ra_bio_pages(struct inode *inode, int misses = 0; page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; - last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + last_offset = (page_offset(page) + PAGE_SIZE); em_tree = &BTRFS_I(inode)->extent_tree; tree = &BTRFS_I(inode)->io_tree; if (isize == 0) return 0; - end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (last_offset < compressed_end) { - pg_index = last_offset >> PAGE_CACHE_SHIFT; + pg_index = last_offset >> PAGE_SHIFT; if (pg_index > end_index) break; @@ -488,11 +488,11 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { - page_cache_release(page); + put_page(page); goto next; } - end = last_offset + PAGE_CACHE_SIZE - 1; + end = last_offset + PAGE_SIZE - 1; /* * at this point, we have a locked page in the page cache * for these bytes in the file. But, we have to make @@ -502,27 +502,27 @@ static noinline int add_ra_bio_pages(struct inode *inode, lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, - PAGE_CACHE_SIZE); + PAGE_SIZE); read_unlock(&em_tree->lock); if (!em || last_offset < em->start || - (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (last_offset + PAGE_SIZE > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, last_offset, end); unlock_page(page); - page_cache_release(page); + put_page(page); break; } free_extent_map(em); if (page->index == end_index) { char *userpage; - size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + size_t zero_offset = isize & (PAGE_SIZE - 1); if (zero_offset) { int zeros; - zeros = PAGE_CACHE_SIZE - zero_offset; + zeros = PAGE_SIZE - zero_offset; userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, zeros); flush_dcache_page(page); @@ -531,19 +531,19 @@ static noinline int add_ra_bio_pages(struct inode *inode, } ret = bio_add_page(cb->orig_bio, page, - PAGE_CACHE_SIZE, 0); + PAGE_SIZE, 0); - if (ret == PAGE_CACHE_SIZE) { + if (ret == PAGE_SIZE) { nr_pages++; - page_cache_release(page); + put_page(page); } else { unlock_extent(tree, last_offset, end); unlock_page(page); - page_cache_release(page); + put_page(page); break; } next: - last_offset += PAGE_CACHE_SIZE; + last_offset += PAGE_SIZE; } return 0; } @@ -567,7 +567,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map_tree *em_tree; struct compressed_bio *cb; struct btrfs_root *root = BTRFS_I(inode)->root; - unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_SIZE; unsigned long compressed_len; unsigned long nr_pages; unsigned long pg_index; @@ -589,7 +589,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, page_offset(bio->bi_io_vec->bv_page), - PAGE_CACHE_SIZE); + PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) return -EIO; @@ -617,7 +617,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; - nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE); + nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) @@ -640,7 +640,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ - uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + uncompressed_len = bio->bi_vcnt * PAGE_SIZE; cb->len = uncompressed_len; comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); @@ -653,18 +653,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, for (pg_index = 0; pg_index < nr_pages; pg_index++) { page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; - page->index = em_start >> PAGE_CACHE_SHIFT; + page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) ret = tree->ops->merge_bio_hook(READ, page, 0, - PAGE_CACHE_SIZE, + PAGE_SIZE, comp_bio, 0); else ret = 0; page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < + PAGE_SIZE) { bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, @@ -702,9 +702,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; - bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + bio_add_page(comp_bio, page, PAGE_SIZE, 0); } - cur_disk_byte += PAGE_CACHE_SIZE; + cur_disk_byte += PAGE_SIZE; } bio_get(comp_bio); @@ -743,8 +743,11 @@ out: static struct { struct list_head idle_ws; spinlock_t ws_lock; - int num_ws; - atomic_t alloc_ws; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ wait_queue_head_t ws_wait; } btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; @@ -758,16 +761,34 @@ void __init btrfs_init_compress(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + struct list_head *workspace; + INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); spin_lock_init(&btrfs_comp_ws[i].ws_lock); - atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); + atomic_set(&btrfs_comp_ws[i].total_ws, 0); init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); + + /* + * Preallocate one workspace for each compression type so + * we can guarantee forward progress in the worst case + */ + workspace = btrfs_compress_op[i]->alloc_workspace(); + if (IS_ERR(workspace)) { + printk(KERN_WARNING + "BTRFS: cannot preallocate compression workspace, will try later"); + } else { + atomic_set(&btrfs_comp_ws[i].total_ws, 1); + btrfs_comp_ws[i].free_ws = 1; + list_add(workspace, &btrfs_comp_ws[i].idle_ws); + } } } /* - * this finds an available workspace or allocates a new one - * ERR_PTR is returned if things go bad. + * This finds an available workspace or allocates a new one. + * If it's not possible to allocate a new one, waits until there's one. + * Preallocation makes a forward progress guarantees and we do not return + * errors. */ static struct list_head *find_workspace(int type) { @@ -777,36 +798,58 @@ static struct list_head *find_workspace(int type) struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *num_ws = &btrfs_comp_ws[idx].num_ws; + int *free_ws = &btrfs_comp_ws[idx].free_ws; again: spin_lock(ws_lock); if (!list_empty(idle_ws)) { workspace = idle_ws->next; list_del(workspace); - (*num_ws)--; + (*free_ws)--; spin_unlock(ws_lock); return workspace; } - if (atomic_read(alloc_ws) > cpus) { + if (atomic_read(total_ws) > cpus) { DEFINE_WAIT(wait); spin_unlock(ws_lock); prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_ws) > cpus && !*num_ws) + if (atomic_read(total_ws) > cpus && !*free_ws) schedule(); finish_wait(ws_wait, &wait); goto again; } - atomic_inc(alloc_ws); + atomic_inc(total_ws); spin_unlock(ws_lock); workspace = btrfs_compress_op[idx]->alloc_workspace(); if (IS_ERR(workspace)) { - atomic_dec(alloc_ws); + atomic_dec(total_ws); wake_up(ws_wait); + + /* + * Do not return the error but go back to waiting. There's a + * workspace preallocated for each type and the compression + * time is bounded so we get to a workspace eventually. This + * makes our caller's life easier. + * + * To prevent silent and low-probability deadlocks (when the + * initial preallocation fails), check if there are any + * workspaces at all. + */ + if (atomic_read(total_ws) == 0) { + static DEFINE_RATELIMIT_STATE(_rs, + /* once per minute */ 60 * HZ, + /* no burst */ 1); + + if (__ratelimit(&_rs)) { + printk(KERN_WARNING + "no compression workspaces, low memory, retrying"); + } + } + goto again; } return workspace; } @@ -820,21 +863,21 @@ static void free_workspace(int type, struct list_head *workspace) int idx = type - 1; struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *num_ws = &btrfs_comp_ws[idx].num_ws; + int *free_ws = &btrfs_comp_ws[idx].free_ws; spin_lock(ws_lock); - if (*num_ws < num_online_cpus()) { + if (*free_ws < num_online_cpus()) { list_add(workspace, idle_ws); - (*num_ws)++; + (*free_ws)++; spin_unlock(ws_lock); goto wake; } spin_unlock(ws_lock); btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_ws); + atomic_dec(total_ws); wake: /* * Make sure counter is updated before we wake up waiters. @@ -857,7 +900,7 @@ static void free_workspaces(void) workspace = btrfs_comp_ws[i].idle_ws.next; list_del(workspace); btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&btrfs_comp_ws[i].alloc_ws); + atomic_dec(&btrfs_comp_ws[i].total_ws); } } } @@ -894,8 +937,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, len, pages, @@ -930,8 +971,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, disk_start, @@ -952,8 +991,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, dest_page, start_byte, @@ -1013,8 +1050,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, /* copy bytes from the working buffer into the pages */ while (working_bytes > 0) { - bytes = min(PAGE_CACHE_SIZE - *pg_offset, - PAGE_CACHE_SIZE - buf_offset); + bytes = min(PAGE_SIZE - *pg_offset, + PAGE_SIZE - buf_offset); bytes = min(bytes, working_bytes); kaddr = kmap_atomic(page_out); memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); @@ -1027,7 +1064,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, current_buf_start += bytes; /* check if we need to pick another page */ - if (*pg_offset == PAGE_CACHE_SIZE) { + if (*pg_offset == PAGE_SIZE) { (*pg_index)++; if (*pg_index >= vcnt) return 0; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 77592931ab4f..decd0a3f5d61 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/rbtree.h> +#include <linux/vmalloc.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1010,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } } else { @@ -1927,7 +1928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = read_node_slot(root, mid, 0); if (!child) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } @@ -2030,7 +2031,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); @@ -5361,10 +5362,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL); + tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN); if (!tmp_buf) { - ret = -ENOMEM; - goto out; + tmp_buf = vmalloc(left_root->nodesize); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } } left_path->search_commit_root = 1; @@ -5565,7 +5569,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, out: btrfs_free_path(left_path); btrfs_free_path(right_path); - kfree(tmp_buf); + kvfree(tmp_buf); return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..ddcc58f03c79 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include <asm/kmap_types.h> #include <linux/pagemap.h> #include <linux/btrfs.h> +#include <linux/btrfs_tree.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/sizes.h> @@ -64,98 +65,6 @@ struct btrfs_ordered_sum; #define BTRFS_COMPAT_EXTENT_TREE_V0 -/* holds pointers to all of the tree roots */ -#define BTRFS_ROOT_TREE_OBJECTID 1ULL - -/* stores information about which extents are in use, and reference counts */ -#define BTRFS_EXTENT_TREE_OBJECTID 2ULL - -/* - * chunk tree stores translations from logical -> physical block numbering - * the super block points to the chunk tree - */ -#define BTRFS_CHUNK_TREE_OBJECTID 3ULL - -/* - * stores information about which areas of a given device are in use. - * one per device. The tree of tree roots points to the device tree - */ -#define BTRFS_DEV_TREE_OBJECTID 4ULL - -/* one per subvolume, storing files and directories */ -#define BTRFS_FS_TREE_OBJECTID 5ULL - -/* directory objectid inside the root tree */ -#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL - -/* holds checksums of all the data extents */ -#define BTRFS_CSUM_TREE_OBJECTID 7ULL - -/* holds quota configuration and tracking */ -#define BTRFS_QUOTA_TREE_OBJECTID 8ULL - -/* for storing items that use the BTRFS_UUID_KEY* types */ -#define BTRFS_UUID_TREE_OBJECTID 9ULL - -/* tracks free space in block groups. */ -#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL - -/* device stats in the device tree */ -#define BTRFS_DEV_STATS_OBJECTID 0ULL - -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - -/* orhpan objectid for tracking unlinked/truncated files */ -#define BTRFS_ORPHAN_OBJECTID -5ULL - -/* does write ahead logging to speed up fsyncs */ -#define BTRFS_TREE_LOG_OBJECTID -6ULL -#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL - -/* for space balancing */ -#define BTRFS_TREE_RELOC_OBJECTID -8ULL -#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL - -/* - * extent checksums all have this objectid - * this allows them to share the logging tree - * for fsyncs - */ -#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL - -/* For storing free space cache */ -#define BTRFS_FREE_SPACE_OBJECTID -11ULL - -/* - * The inode number assigned to the special inode for storing - * free ino cache - */ -#define BTRFS_FREE_INO_OBJECTID -12ULL - -/* dummy objectid represents multiple objectids */ -#define BTRFS_MULTIPLE_OBJECTIDS -255ULL - -/* - * All files have objectids in this range. - */ -#define BTRFS_FIRST_FREE_OBJECTID 256ULL -#define BTRFS_LAST_FREE_OBJECTID -256ULL -#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL - - -/* - * the device items go into the chunk tree. The key is in the form - * [ 1 BTRFS_DEV_ITEM_KEY device_id ] - */ -#define BTRFS_DEV_ITEMS_OBJECTID 1ULL - -#define BTRFS_BTREE_INODE_OBJECTID 1 - -#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 - -#define BTRFS_DEV_REPLACE_DEVID 0ULL - /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. @@ -175,12 +84,6 @@ struct btrfs_ordered_sum; */ #define BTRFS_LINK_MAX 65535U -/* 32 bytes in various csum fields */ -#define BTRFS_CSUM_SIZE 32 - -/* csum types */ -#define BTRFS_CSUM_TYPE_CRC32 0 - static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ @@ -189,17 +92,6 @@ static const int btrfs_csum_sizes[] = { 4 }; /* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ #define REQ_GET_READ_MIRRORS (1 << 30) -#define BTRFS_FT_UNKNOWN 0 -#define BTRFS_FT_REG_FILE 1 -#define BTRFS_FT_DIR 2 -#define BTRFS_FT_CHRDEV 3 -#define BTRFS_FT_BLKDEV 4 -#define BTRFS_FT_FIFO 5 -#define BTRFS_FT_SOCK 6 -#define BTRFS_FT_SYMLINK 7 -#define BTRFS_FT_XATTR 8 -#define BTRFS_FT_MAX 9 - /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) @@ -207,138 +99,10 @@ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_MAX_EXTENT_SIZE SZ_128M -/* - * The key defines the order in the tree, and so it also defines (optimal) - * block layout. - * - * objectid corresponds to the inode number. - * - * type tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with type of 1 might refer to the inode data, - * type of 2 may point to file data in the btree and type == 3 may point to - * extents. - * - * offset is the starting byte offset for this key in the stream. - * - * btrfs_disk_key is in disk byte order. struct btrfs_key is always - * in cpu native order. Otherwise they are identical and their sizes - * should be the same (ie both packed) - */ -struct btrfs_disk_key { - __le64 objectid; - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -struct btrfs_key { - u64 objectid; - u8 type; - u64 offset; -} __attribute__ ((__packed__)); - struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; -struct btrfs_dev_item { - /* the internal btrfs device id */ - __le64 devid; - - /* size of the device */ - __le64 total_bytes; - - /* bytes used */ - __le64 bytes_used; - - /* optimal io alignment for this device */ - __le32 io_align; - - /* optimal io width for this device */ - __le32 io_width; - - /* minimal io size for this device */ - __le32 sector_size; - - /* type and info about this device */ - __le64 type; - - /* expected generation for this device */ - __le64 generation; - - /* - * starting byte of this partition on the device, - * to allow for stripe alignment in the future - */ - __le64 start_offset; - - /* grouping information for allocation decisions */ - __le32 dev_group; - - /* seek speed 0-100 where 100 is fastest */ - u8 seek_speed; - - /* bandwidth 0-100 where 100 is fastest */ - u8 bandwidth; - - /* btrfs generated uuid for this device */ - u8 uuid[BTRFS_UUID_SIZE]; - - /* uuid of FS who owns this device */ - u8 fsid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_stripe { - __le64 devid; - __le64 offset; - u8 dev_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_chunk { - /* size of this chunk in bytes */ - __le64 length; - - /* objectid of the root referencing this chunk */ - __le64 owner; - - __le64 stripe_len; - __le64 type; - - /* optimal io alignment for this chunk */ - __le32 io_align; - - /* optimal io width for this chunk */ - __le32 io_width; - - /* minimal io size for this chunk */ - __le32 sector_size; - - /* 2^16 stripes is quite a lot, a second limit is the size of a single - * item in the btree - */ - __le16 num_stripes; - - /* sub stripes only matter for raid10 */ - __le16 sub_stripes; - struct btrfs_stripe stripe; - /* additional stripes go here */ -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_EXTENT 1 -#define BTRFS_FREE_SPACE_BITMAP 2 - -struct btrfs_free_space_entry { - __le64 offset; - __le64 bytes; - u8 type; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_header { - struct btrfs_disk_key location; - __le64 generation; - __le64 num_entries; - __le64 num_bitmaps; -} __attribute__ ((__packed__)); - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -346,9 +110,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) -#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) - /* * File system states */ @@ -357,13 +118,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_TRANS_ABORTED 2 #define BTRFS_FS_STATE_DEV_REPLACING 3 -/* Super block flags */ -/* Errors detected */ -#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) - -#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) -#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) - #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 #define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ @@ -410,7 +164,6 @@ struct btrfs_header { * room to translate 14 chunks with 3 stripes each. */ #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 -#define BTRFS_LABEL_SIZE 256 /* * just in case we somehow lose the roots and are not able to mount, @@ -507,31 +260,6 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ -#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) - -#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) -#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) - -/* - * older kernels tried to do bigger metadata blocks, but the - * code was pretty buggy. Lets not let them try anymore. - */ -#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) - -#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) -#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) -#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) -#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) - #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL @@ -624,357 +352,8 @@ struct btrfs_path { unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; }; - -/* - * items in the extent btree are used to record the objectid of the - * owner of the block and the number of references - */ - -struct btrfs_extent_item { - __le64 refs; - __le64 generation; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_extent_item_v0 { - __le32 refs; -} __attribute__ ((__packed__)); - #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ sizeof(struct btrfs_item)) - -#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) -#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) - -/* following flags only apply to tree blocks */ - -/* use full backrefs for extent pointers in the block */ -#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) - -/* - * this flag is only used internally by scrub and may be changed at any time - * it is only declared here to avoid collisions - */ -#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) - -struct btrfs_tree_block_info { - struct btrfs_disk_key key; - u8 level; -} __attribute__ ((__packed__)); - -struct btrfs_extent_data_ref { - __le64 root; - __le64 objectid; - __le64 offset; - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_shared_data_ref { - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_extent_inline_ref { - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -/* old style backrefs item */ -struct btrfs_extent_ref_v0 { - __le64 root; - __le64 generation; - __le64 objectid; - __le32 count; -} __attribute__ ((__packed__)); - - -/* dev extents record free space on individual devices. The owner - * field points back to the chunk allocation mapping tree that allocated - * the extent. The chunk tree uuid field is a way to double check the owner - */ -struct btrfs_dev_extent { - __le64 chunk_tree; - __le64 chunk_objectid; - __le64 chunk_offset; - __le64 length; - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_inode_ref { - __le64 index; - __le16 name_len; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_inode_extref { - __le64 parent_objectid; - __le64 index; - __le16 name_len; - __u8 name[0]; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_timespec { - __le64 sec; - __le32 nsec; -} __attribute__ ((__packed__)); - -struct btrfs_inode_item { - /* nfs style generation number */ - __le64 generation; - /* transid that last touched this inode */ - __le64 transid; - __le64 size; - __le64 nbytes; - __le64 block_group; - __le32 nlink; - __le32 uid; - __le32 gid; - __le32 mode; - __le64 rdev; - __le64 flags; - - /* modification sequence number for NFS */ - __le64 sequence; - - /* - * a little future expansion, for more than this we can - * just grow the inode item and version it - */ - __le64 reserved[4]; - struct btrfs_timespec atime; - struct btrfs_timespec ctime; - struct btrfs_timespec mtime; - struct btrfs_timespec otime; -} __attribute__ ((__packed__)); - -struct btrfs_dir_log_item { - __le64 end; -} __attribute__ ((__packed__)); - -struct btrfs_dir_item { - struct btrfs_disk_key location; - __le64 transid; - __le16 data_len; - __le16 name_len; - u8 type; -} __attribute__ ((__packed__)); - -#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) - -/* - * Internal in-memory flag that a subvolume has been marked for deletion but - * still visible as a directory - */ -#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) - -struct btrfs_root_item { - struct btrfs_inode_item inode; - __le64 generation; - __le64 root_dirid; - __le64 bytenr; - __le64 byte_limit; - __le64 bytes_used; - __le64 last_snapshot; - __le64 flags; - __le32 refs; - struct btrfs_disk_key drop_progress; - u8 drop_level; - u8 level; - - /* - * The following fields appear after subvol_uuids+subvol_times - * were introduced. - */ - - /* - * This generation number is used to test if the new fields are valid - * and up to date while reading the root item. Every time the root item - * is written out, the "generation" field is copied into this field. If - * anyone ever mounted the fs with an older kernel, we will have - * mismatching generation values here and thus must invalidate the - * new fields. See btrfs_update_root and btrfs_find_last_root for - * details. - * the offset of generation_v2 is also used as the start for the memset - * when invalidating the fields. - */ - __le64 generation_v2; - u8 uuid[BTRFS_UUID_SIZE]; - u8 parent_uuid[BTRFS_UUID_SIZE]; - u8 received_uuid[BTRFS_UUID_SIZE]; - __le64 ctransid; /* updated when an inode changes */ - __le64 otransid; /* trans when created */ - __le64 stransid; /* trans when sent. non-zero for received subvol */ - __le64 rtransid; /* trans when received. non-zero for received subvol */ - struct btrfs_timespec ctime; - struct btrfs_timespec otime; - struct btrfs_timespec stime; - struct btrfs_timespec rtime; - __le64 reserved[8]; /* for future */ -} __attribute__ ((__packed__)); - -/* - * this is used for both forward and backward root refs - */ -struct btrfs_root_ref { - __le64 dirid; - __le64 sequence; - __le16 name_len; -} __attribute__ ((__packed__)); - -struct btrfs_disk_balance_args { - /* - * profiles to operate on, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 profiles; - - /* - * usage filter - * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' - * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max - */ - union { - __le64 usage; - struct { - __le32 usage_min; - __le32 usage_max; - }; - }; - - /* devid filter */ - __le64 devid; - - /* devid subset filter [pstart..pend) */ - __le64 pstart; - __le64 pend; - - /* btrfs virtual address space subset filter [vstart..vend) */ - __le64 vstart; - __le64 vend; - - /* - * profile to convert to, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 target; - - /* BTRFS_BALANCE_ARGS_* */ - __le64 flags; - - /* - * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' - * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum - * and maximum - */ - union { - __le64 limit; - struct { - __le32 limit_min; - __le32 limit_max; - }; - }; - - /* - * Process chunks that cross stripes_min..stripes_max devices, - * BTRFS_BALANCE_ARGS_STRIPES_RANGE - */ - __le32 stripes_min; - __le32 stripes_max; - - __le64 unused[6]; -} __attribute__ ((__packed__)); - -/* - * store balance parameters to disk so that balance can be properly - * resumed after crash or unmount - */ -struct btrfs_balance_item { - /* BTRFS_BALANCE_* */ - __le64 flags; - - struct btrfs_disk_balance_args data; - struct btrfs_disk_balance_args meta; - struct btrfs_disk_balance_args sys; - - __le64 unused[4]; -} __attribute__ ((__packed__)); - -#define BTRFS_FILE_EXTENT_INLINE 0 -#define BTRFS_FILE_EXTENT_REG 1 -#define BTRFS_FILE_EXTENT_PREALLOC 2 - -struct btrfs_file_extent_item { - /* - * transaction id that created this extent - */ - __le64 generation; - /* - * max number of bytes to hold this extent in ram - * when we split a compressed extent we can't know how big - * each of the resulting pieces will be. So, this is - * an upper limit on the size of the extent in ram instead of - * an exact limit. - */ - __le64 ram_bytes; - - /* - * 32 bits for the various ways we might encode the data, - * including compression and encryption. If any of these - * are set to something a given disk format doesn't understand - * it is treated like an incompat flag for reading and writing, - * but not for stat. - */ - u8 compression; - u8 encryption; - __le16 other_encoding; /* spare for later use */ - - /* are we inline data or a real extent? */ - u8 type; - - /* - * disk space consumed by the extent, checksum blocks are included - * in these numbers - * - * At this offset in the structure, the inline extent data start. - */ - __le64 disk_bytenr; - __le64 disk_num_bytes; - /* - * the logical offset in file blocks (no csums) - * this extent record is for. This allows a file extent to point - * into the middle of an existing extent on disk, sharing it - * between two snapshots (useful if some bytes in the middle of the - * extent have changed - */ - __le64 offset; - /* - * the logical number of file blocks (no csums included). This - * always reflects the size uncompressed and without encoding. - */ - __le64 num_bytes; - -} __attribute__ ((__packed__)); - -struct btrfs_csum_item { - u8 csum; -} __attribute__ ((__packed__)); - -struct btrfs_dev_stats_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; -} __attribute__ ((__packed__)); - -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 -#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 - struct btrfs_dev_replace { u64 replace_state; /* see #define above */ u64 time_started; /* seconds since 1-Jan-1970 */ @@ -1005,175 +384,6 @@ struct btrfs_dev_replace { struct btrfs_scrub_progress scrub_progress; }; -struct btrfs_dev_replace_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 src_devid; - __le64 cursor_left; - __le64 cursor_right; - __le64 cont_reading_from_srcdev_mode; - - __le64 replace_state; - __le64 time_started; - __le64 time_stopped; - __le64 num_write_errors; - __le64 num_uncorrectable_read_errors; -} __attribute__ ((__packed__)); - -/* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) -#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) -#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) -#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ - BTRFS_SPACE_INFO_GLOBAL_RSV) - -enum btrfs_raid_types { - BTRFS_RAID_RAID10, - BTRFS_RAID_RAID1, - BTRFS_RAID_DUP, - BTRFS_RAID_RAID0, - BTRFS_RAID_SINGLE, - BTRFS_RAID_RAID5, - BTRFS_RAID_RAID6, - BTRFS_NR_RAID_TYPES -}; - -#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ - BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) - -#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ - BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6 | \ - BTRFS_BLOCK_GROUP_DUP | \ - BTRFS_BLOCK_GROUP_RAID10) -#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6) - -/* - * We need a bit for restriper to be able to tell when chunks of type - * SINGLE are available. This "extended" profile format is used in - * fs_info->avail_*_alloc_bits (in-memory) and balance item fields - * (on-disk). The corresponding on-disk bit in chunk.type is reserved - * to avoid remappings between two formats in future. - */ -#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) - -/* - * A fake block group type that is used to communicate global block reserve - * size to userspace via the SPACE_INFO ioctl. - */ -#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) - -#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ - BTRFS_AVAIL_ALLOC_BIT_SINGLE) - -static inline u64 chunk_to_extended(u64 flags) -{ - if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) - flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - return flags; -} -static inline u64 extended_to_chunk(u64 flags) -{ - return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; -} - -struct btrfs_block_group_item { - __le64 used; - __le64 chunk_objectid; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_info { - __le32 extent_count; - __le32 flags; -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) - -#define BTRFS_QGROUP_LEVEL_SHIFT 48 -static inline u64 btrfs_qgroup_level(u64 qgroupid) -{ - return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; -} - -/* - * is subvolume quota turned on? - */ -#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) -/* - * RESCAN is set during the initialization phase - */ -#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) -/* - * Some qgroup entries are known to be out of date, - * either because the configuration has changed in a way that - * makes a rescan necessary, or because the fs has been mounted - * with a non-qgroup-aware version. - * Turning qouta off and on again makes it inconsistent, too. - */ -#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) - -#define BTRFS_QGROUP_STATUS_VERSION 1 - -struct btrfs_qgroup_status_item { - __le64 version; - /* - * the generation is updated during every commit. As older - * versions of btrfs are not aware of qgroups, it will be - * possible to detect inconsistencies by checking the - * generation on mount time - */ - __le64 generation; - - /* flag definitions see above */ - __le64 flags; - - /* - * only used during scanning to record the progress - * of the scan. It contains a logical address - */ - __le64 rescan; -} __attribute__ ((__packed__)); - -struct btrfs_qgroup_info_item { - __le64 generation; - __le64 rfer; - __le64 rfer_cmpr; - __le64 excl; - __le64 excl_cmpr; -} __attribute__ ((__packed__)); - -/* flags definition for qgroup limits */ -#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) -#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) -#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) -#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) -#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) - -struct btrfs_qgroup_limit_item { - /* - * only updated when any of the other values change - */ - __le64 flags; - __le64 max_rfer; - __le64 max_excl; - __le64 rsv_rfer; - __le64 rsv_excl; -} __attribute__ ((__packed__)); - /* For raid type sysfs entries */ struct raid_kobject { int raid_type; @@ -1408,6 +618,27 @@ struct btrfs_block_group_cache { struct btrfs_io_ctl io_ctl; + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. + * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. + */ + atomic_t nocow_writers; + /* Lock for free space tree operations. */ struct mutex free_space_lock; @@ -2026,228 +1257,6 @@ struct btrfs_root { atomic_t qgroup_meta_rsv; }; -struct btrfs_ioctl_defrag_range_args { - /* start of the defrag operation */ - __u64 start; - - /* number of bytes to defrag, use (u64)-1 to say all */ - __u64 len; - - /* - * flags for the operation, which can include turning - * on compression for this one defrag - */ - __u64 flags; - - /* - * any extent bigger than this will be considered - * already defragged. Use 0 to take the kernel default - * Use 1 to say every single extent must be rewritten - */ - __u32 extent_thresh; - - /* - * which compression method to use if turning on compression - * for this defrag operation. If unspecified, zlib will - * be used - */ - __u32 compress_type; - - /* spare for later */ - __u32 unused[4]; -}; - - -/* - * inode items have the data typically returned from stat and store other - * info about object characteristics. There is one for every file and dir in - * the FS - */ -#define BTRFS_INODE_ITEM_KEY 1 -#define BTRFS_INODE_REF_KEY 12 -#define BTRFS_INODE_EXTREF_KEY 13 -#define BTRFS_XATTR_ITEM_KEY 24 -#define BTRFS_ORPHAN_ITEM_KEY 48 -/* reserve 2-15 close to the inode for later flexibility */ - -/* - * dir items are the name -> inode pointers in a directory. There is one - * for every name in a directory. - */ -#define BTRFS_DIR_LOG_ITEM_KEY 60 -#define BTRFS_DIR_LOG_INDEX_KEY 72 -#define BTRFS_DIR_ITEM_KEY 84 -#define BTRFS_DIR_INDEX_KEY 96 -/* - * extent data is for file data - */ -#define BTRFS_EXTENT_DATA_KEY 108 - -/* - * extent csums are stored in a separate tree and hold csums for - * an entire extent on disk. - */ -#define BTRFS_EXTENT_CSUM_KEY 128 - -/* - * root items point to tree roots. They are typically in the root - * tree used by the super block to find all the other trees - */ -#define BTRFS_ROOT_ITEM_KEY 132 - -/* - * root backrefs tie subvols and snapshots to the directory entries that - * reference them - */ -#define BTRFS_ROOT_BACKREF_KEY 144 - -/* - * root refs make a fast index for listing all of the snapshots and - * subvolumes referenced by a given root. They point directly to the - * directory item in the root that references the subvol - */ -#define BTRFS_ROOT_REF_KEY 156 - -/* - * extent items are in the extent map tree. These record which blocks - * are used, and how many references there are to each block - */ -#define BTRFS_EXTENT_ITEM_KEY 168 - -/* - * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know - * the length, so we save the level in key->offset instead of the length. - */ -#define BTRFS_METADATA_ITEM_KEY 169 - -#define BTRFS_TREE_BLOCK_REF_KEY 176 - -#define BTRFS_EXTENT_DATA_REF_KEY 178 - -#define BTRFS_EXTENT_REF_V0_KEY 180 - -#define BTRFS_SHARED_BLOCK_REF_KEY 182 - -#define BTRFS_SHARED_DATA_REF_KEY 184 - -/* - * block groups give us hints into the extent allocation trees. Which - * blocks are free etc etc - */ -#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 - -/* - * Every block group is represented in the free space tree by a free space info - * item, which stores some accounting information. It is keyed on - * (block_group_start, FREE_SPACE_INFO, block_group_length). - */ -#define BTRFS_FREE_SPACE_INFO_KEY 198 - -/* - * A free space extent tracks an extent of space that is free in a block group. - * It is keyed on (start, FREE_SPACE_EXTENT, length). - */ -#define BTRFS_FREE_SPACE_EXTENT_KEY 199 - -/* - * When a block group becomes very fragmented, we convert it to use bitmaps - * instead of extents. A free space bitmap is keyed on - * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with - * (length / sectorsize) bits. - */ -#define BTRFS_FREE_SPACE_BITMAP_KEY 200 - -#define BTRFS_DEV_EXTENT_KEY 204 -#define BTRFS_DEV_ITEM_KEY 216 -#define BTRFS_CHUNK_ITEM_KEY 228 - -/* - * Records the overall state of the qgroups. - * There's only one instance of this key present, - * (0, BTRFS_QGROUP_STATUS_KEY, 0) - */ -#define BTRFS_QGROUP_STATUS_KEY 240 -/* - * Records the currently used space of the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). - */ -#define BTRFS_QGROUP_INFO_KEY 242 -/* - * Contains the user configured limits for the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). - */ -#define BTRFS_QGROUP_LIMIT_KEY 244 -/* - * Records the child-parent relationship of qgroups. For - * each relation, 2 keys are present: - * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) - * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) - */ -#define BTRFS_QGROUP_RELATION_KEY 246 - -/* - * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY. - */ -#define BTRFS_BALANCE_ITEM_KEY 248 - -/* - * The key type for tree items that are stored persistently, but do not need to - * exist for extended period of time. The items can exist in any tree. - * - * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data] - * - * Existing items: - * - * - balance status item - * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) - */ -#define BTRFS_TEMPORARY_ITEM_KEY 248 - -/* - * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY - */ -#define BTRFS_DEV_STATS_KEY 249 - -/* - * The key type for tree items that are stored persistently and usually exist - * for a long period, eg. filesystem lifetime. The item kinds can be status - * information, stats or preference values. The item can exist in any tree. - * - * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data] - * - * Existing items: - * - * - device statistics, store IO stats in the device tree, one key for all - * stats - * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0) - */ -#define BTRFS_PERSISTENT_ITEM_KEY 249 - -/* - * Persistantly stores the device replace state in the device tree. - * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). - */ -#define BTRFS_DEV_REPLACE_KEY 250 - -/* - * Stores items that allow to quickly map UUIDs to something else. - * These items are part of the filesystem UUID tree. - * The key is built like this: - * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). - */ -#if BTRFS_UUID_SIZE != 16 -#error "UUID items require BTRFS_UUID_SIZE == 16!" -#endif -#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ -#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to - * received subvols */ - -/* - * string items are for debugging. They just store a short string of - * data in the FS - */ -#define BTRFS_STRING_ITEM_KEY 253 - /* * Flags for mount options. * @@ -3499,6 +2508,12 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); @@ -4122,6 +3137,7 @@ void btrfs_test_inode_set_ops(struct inode *inode); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); @@ -4326,10 +3342,9 @@ static inline void assfail(char *expr, char *file, int line) #define ASSERT(expr) ((void)0) #endif -#define btrfs_assert() __printf(5, 6) __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); const char *btrfs_decode_error(int errno); @@ -4339,6 +3354,46 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ +#define btrfs_abort_transaction(trans, root, errno) \ +do { \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((root)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), (root), __func__, \ + __LINE__, (errno)); \ +} while (0) + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + + +/* compatibility and incompatibility defines */ + #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) @@ -4455,44 +3510,6 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) return !!(btrfs_super_compat_ro_flags(disk_super) & flag); } -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact line number is reported. - */ -#define btrfs_abort_transaction(trans, root, errno) \ -do { \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((root)->fs_info->fs_state))) { \ - WARN(1, KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno)); \ - } \ - __btrfs_abort_transaction((trans), (root), __func__, \ - __LINE__, (errno)); \ -} while (0) - -#define btrfs_std_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_std_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6cef0062f929..61561c2a3f96 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -134,7 +134,7 @@ again: /* cached in the btrfs inode and can be accessed */ atomic_add(2, &node->refs); - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) { kmem_cache_free(delayed_node_cache, node); return ERR_PTR(ret); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a1d6652e0c47..85f12e6e28d2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -44,9 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device *tgtdev); -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device); static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); static int btrfs_dev_replace_kthread(void *data); static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); @@ -305,8 +302,8 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -int btrfs_dev_replace_start(struct btrfs_root *root, - struct btrfs_ioctl_dev_replace_args *args) +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src) { struct btrfs_trans_handle *trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -315,29 +312,16 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - switch (args->start.cont_reading_from_srcdev_mode) { - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: - break; - default: - return -EINVAL; - } - - if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || - args->start.tgtdev_name[0] == '\0') - return -EINVAL; - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); - ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, - args->start.srcdev_name, - &src_device); + ret = btrfs_find_device_by_devspec(root, srcdevid, + srcdev_name, &src_device); if (ret) { mutex_unlock(&fs_info->volume_mutex); return ret; } - ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name, src_device, &tgt_device); mutex_unlock(&fs_info->volume_mutex); if (ret) @@ -364,18 +348,17 @@ int btrfs_dev_replace_start(struct btrfs_root *root, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; goto leave; } - dev_replace->cont_reading_from_srcdev_mode = - args->start.cont_reading_from_srcdev_mode; + dev_replace->cont_reading_from_srcdev_mode = read_src; WARN_ON(!src_device); dev_replace->srcdev = src_device; WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - btrfs_info_in_rcu(root->fs_info, + btrfs_info_in_rcu(fs_info, "dev_replace from %s (devid %llu) to %s started", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), @@ -394,14 +377,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->cursor_right = 0; dev_replace->is_valid = 1; dev_replace->item_needs_writeback = 1; - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); btrfs_dev_replace_unlock(dev_replace, 1); ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_err(fs_info, "kobj add dev failed %d\n", ret); - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -419,11 +403,9 @@ int btrfs_dev_replace_start(struct btrfs_root *root, btrfs_device_get_total_bytes(src_device), &dev_replace->scrub_progress, 0, 1); - ret = btrfs_dev_replace_finishing(root->fs_info, ret); - /* don't warn if EINPROGRESS, someone else might be running scrub */ + ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - ret = 0; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; } else { WARN_ON(ret); } @@ -438,6 +420,35 @@ leave: return ret; } +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + int ret; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + ret = btrfs_dev_replace_start(root, args->start.tgtdev_name, + args->start.srcdevid, + args->start.srcdev_name, + args->start.cont_reading_from_srcdev_mode); + args->result = ret; + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) + ret = 0; + + return ret; +} + /* * blocked until all flighting bios are finished. */ @@ -493,7 +504,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -558,10 +569,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ASSERT(list_empty(&src_device->resized_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; - if (fs_info->sb->s_bdev == src_device->bdev) - fs_info->sb->s_bdev = tgt_device->bdev; - if (fs_info->fs_devices->latest_bdev == src_device->bdev) - fs_info->fs_devices->latest_bdev = tgt_device->bdev; + + btrfs_assign_next_active_device(fs_info, src_device, tgt_device); + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; @@ -624,25 +634,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( write_unlock(&em_tree->lock); } -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device) -{ - int ret; - - if (srcdevid) { - ret = 0; - *device = btrfs_find_device(root->fs_info, srcdevid, NULL, - NULL); - if (!*device) - ret = -ENOENT; - } else { - ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, - device); - } - return ret; -} - void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 29e3ef5f96bd..e922b42d91df 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,8 +25,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); -int btrfs_dev_replace_start(struct btrfs_root *root, +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4b02591b0301..91d123938cef 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -25,7 +25,6 @@ #include <linux/buffer_head.h> #include <linux/workqueue.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/slab.h> #include <linux/migrate.h> #include <linux/ratelimit.h> @@ -303,7 +302,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, err = map_private_extent_buffer(buf, offset, 32, &kaddr, &map_start, &map_len); if (err) - return 1; + return err; cur_len = min(len, map_len - (offset - map_start)); crc = btrfs_csum_data(kaddr + offset - map_start, crc, cur_len); @@ -313,7 +312,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, if (csum_size > sizeof(inline_result)) { result = kzalloc(csum_size, GFP_NOFS); if (!result) - return 1; + return -ENOMEM; } else { result = (char *)&inline_result; } @@ -334,7 +333,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); - return 1; + return -EUCLEAN; } } else { write_extent_buffer(buf, result, 0, csum_size); @@ -513,11 +512,21 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) return 0; + found_start = btrfs_header_bytenr(eb); - if (WARN_ON(found_start != start || !PageUptodate(page))) - return 0; - csum_tree_block(fs_info, eb, 0); - return 0; + /* + * Please do not consolidate these warnings into a single if. + * It is useful to know what went wrong. + */ + if (WARN_ON(found_start != start)) + return -EUCLEAN; + if (WARN_ON(!PageUptodate(page))) + return -EUCLEAN; + + ASSERT(memcmp_extent_buffer(eb, fs_info->fsid, + btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); + + return csum_tree_block(fs_info, eb, 0); } static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, @@ -661,10 +670,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, eb, found_level); ret = csum_tree_block(fs_info, eb, 1); - if (ret) { - ret = -EIO; + if (ret) goto err; - } /* * If this is a leaf block and it is corrupt, set the corrupt bit so @@ -1055,7 +1062,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, (unsigned long long)page_offset(page)); ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } } @@ -1633,7 +1640,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) return ret; @@ -1757,7 +1764,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) if (err) return err; - bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; + bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; @@ -1831,7 +1838,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(root->fs_info); sleep: - if (!try_to_freeze() && !again) { + if (!again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -1921,14 +1928,12 @@ sleep: if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))) btrfs_cleanup_transaction(root); - if (!try_to_freeze()) { - set_current_state(TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && - (!btrfs_transaction_blocked(root->fs_info) || - cannot_commit)) - schedule_timeout(delay); - __set_current_state(TASK_RUNNING); - } + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop() && + (!btrfs_transaction_blocked(root->fs_info) || + cannot_commit)) + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); } while (!kthread_should_stop()); return 0; } @@ -2412,7 +2417,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_std_error(tree_root->fs_info, ret, + btrfs_handle_fs_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2512,6 +2517,7 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; + bool cleaner_mutex_locked = false; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2537,7 +2543,7 @@ int open_ctree(struct super_block *sb, err = ret; goto fail_bdi; } - fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); @@ -2708,7 +2714,7 @@ int open_ctree(struct super_block *sb, * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ if (btrfs_check_super_csum(bh->b_data)) { - printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); + btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); goto fail_alloc; @@ -2728,7 +2734,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { - printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); + btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; goto fail_alloc; } @@ -2763,9 +2769,9 @@ int open_ctree(struct super_block *sb, features = btrfs_super_incompat_flags(disk_super) & ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { - printk(KERN_ERR "BTRFS: couldn't mount because of " - "unsupported optional features (%Lx).\n", - features); + btrfs_err(fs_info, + "cannot mount because of unsupported optional features (%llx)", + features); err = -EINVAL; goto fail_alloc; } @@ -2776,15 +2782,16 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_INFO "BTRFS: has skinny extents\n"); + btrfs_info(fs_info, "has skinny extents"); /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size */ - if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) { + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); + btrfs_info(fs_info, + "flagging fs with big metadata feature"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } @@ -2800,9 +2807,9 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { - printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " - "are not allowed for mixed block groups on %s\n", - sb->s_id); + btrfs_err(fs_info, +"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", + nodesize, sectorsize); goto fail_alloc; } @@ -2815,8 +2822,8 @@ int open_ctree(struct super_block *sb, features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " - "unsupported option features (%Lx).\n", + btrfs_err(fs_info, + "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; goto fail_alloc; @@ -2832,7 +2839,7 @@ int open_ctree(struct super_block *sb, fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - SZ_4M / PAGE_CACHE_SIZE); + SZ_4M / PAGE_SIZE); tree_root->nodesize = nodesize; tree_root->sectorsize = sectorsize; @@ -2845,8 +2852,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_ERR "BTRFS: failed to read the system " - "array on %s\n", sb->s_id); + btrfs_err(fs_info, "failed to read the system array: %d", ret); goto fail_sb_buffer; } @@ -2860,8 +2866,7 @@ int open_ctree(struct super_block *sb, generation); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { - printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk root"); if (!IS_ERR(chunk_root->node)) free_extent_buffer(chunk_root->node); chunk_root->node = NULL; @@ -2875,8 +2880,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk tree: %d", ret); goto fail_tree_roots; } @@ -2887,8 +2891,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_ERR "BTRFS: failed to read devices on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@ -2900,8 +2903,7 @@ retry_root_backup: generation); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { - printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", - sb->s_id); + btrfs_warn(fs_info, "failed to read tree root"); if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); tree_root->node = NULL; @@ -2933,20 +2935,19 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to recover balance\n"); + btrfs_err(fs_info, "failed to recover balance: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", - ret); + btrfs_err(fs_info, "failed to init dev_stats: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { - pr_err("BTRFS: failed to init dev_replace: %d\n", ret); + btrfs_err(fs_info, "failed to init dev_replace: %d", ret); goto fail_block_groups; } @@ -2954,31 +2955,33 @@ retry_root_backup: ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { - pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", + ret); goto fail_block_groups; } ret = btrfs_sysfs_add_device(fs_devices); if (ret) { - pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs device interface: %d", + ret); goto fail_fsdev_sysfs; } ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { - pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + btrfs_err(fs_info, "failed to initialize space info: %d", ret); goto fail_sysfs; } ret = btrfs_read_block_groups(fs_info->extent_root); if (ret) { - printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + btrfs_err(fs_info, "failed to read block groups: %d", ret); goto fail_sysfs; } fs_info->num_tolerated_disk_barrier_failures = @@ -2986,12 +2989,20 @@ retry_root_backup: if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n", + btrfs_warn(fs_info, +"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed", fs_info->fs_devices->missing_devices, fs_info->num_tolerated_disk_barrier_failures); goto fail_sysfs; } + /* + * Hold the cleaner_mutex thread here so that we don't block + * for a long time on btrfs_recover_relocation. cleaner_kthread + * will wait for us to finish mounting the filesystem. + */ + mutex_lock(&fs_info->cleaner_mutex); + cleaner_mutex_locked = true; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) @@ -3006,8 +3017,7 @@ retry_root_backup: if (!btrfs_test_opt(tree_root, SSD) && !btrfs_test_opt(tree_root, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " - "mode\n"); + btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); btrfs_set_opt(fs_info->mount_opt, SSD); } @@ -3025,8 +3035,9 @@ retry_root_backup: 1 : 0, fs_info->check_integrity_print_mask); if (ret) - printk(KERN_WARNING "BTRFS: failed to initialize" - " integrity check module %s\n", sb->s_id); + btrfs_warn(fs_info, + "failed to initialize integrity check module: %d", + ret); } #endif ret = btrfs_read_qgroup_config(fs_info); @@ -3051,17 +3062,17 @@ retry_root_backup: ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto fail_qgroup; - - mutex_lock(&fs_info->cleaner_mutex); + /* We locked cleaner_mutex before creating cleaner_kthread. */ ret = btrfs_recover_relocation(tree_root); - mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { - printk(KERN_WARNING - "BTRFS: failed to recover relocation\n"); + btrfs_warn(fs_info, "failed to recover relocation: %d", + ret); err = -EINVAL; goto fail_qgroup; } } + mutex_unlock(&fs_info->cleaner_mutex); + cleaner_mutex_locked = false; location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; @@ -3078,11 +3089,11 @@ retry_root_backup: if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: creating free space tree\n"); + btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create free space tree: %d", ret); close_ctree(tree_root); return ret; } @@ -3099,14 +3110,14 @@ retry_root_backup: ret = btrfs_resume_balance_async(fs_info); if (ret) { - printk(KERN_WARNING "BTRFS: failed to resume balance\n"); + btrfs_warn(fs_info, "failed to resume balance: %d", ret); close_ctree(tree_root); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("BTRFS: failed to resume dev_replace\n"); + btrfs_warn(fs_info, "failed to resume device replace: %d", ret); close_ctree(tree_root); return ret; } @@ -3115,33 +3126,33 @@ retry_root_backup: if (btrfs_test_opt(tree_root, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: clearing free space tree\n"); + btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to clear free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); close_ctree(tree_root); return ret; } } if (!fs_info->uuid_root) { - pr_info("BTRFS: creating UUID tree\n"); + btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create the UUID tree: %d", ret); close_ctree(tree_root); return ret; } } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { - pr_info("BTRFS: checking UUID tree\n"); + btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to check the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to check the UUID tree: %d", ret); close_ctree(tree_root); return ret; } @@ -3175,6 +3186,10 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: + if (cleaner_mutex_locked) { + mutex_unlock(&fs_info->cleaner_mutex); + cleaner_mutex_locked = false; + } btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: @@ -3641,7 +3656,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3681,7 +3696,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3699,7 +3714,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -4071,9 +4086,9 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, ret = -EINVAL; } /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_CACHE_SIZE) { + if (sectorsize != PAGE_SIZE) { printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n", - sectorsize, PAGE_CACHE_SIZE); + sectorsize, PAGE_SIZE); ret = -EINVAL; } if (!is_power_of_2(nodesize) || nodesize < sectorsize || diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53e12977bfd0..9424864fd01a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3452,7 +3452,7 @@ again: num_pages = 1; num_pages *= 16; - num_pages *= PAGE_CACHE_SIZE; + num_pages *= PAGE_SIZE; ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) @@ -3824,6 +3824,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + bool ret = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return false; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + /* no put on block group, done by btrfs_dec_nocow_writers */ + if (!ret) + btrfs_put_block_group(bg); + + return ret; + +} + +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + ASSERT(bg); + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_atomic_t(&bg->nocow_writers); + /* + * Once for our lookup and once for the lookup done by a previous call + * to btrfs_inc_nocow_writers() + */ + btrfs_put_block_group(bg); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +{ + wait_on_atomic_t(&bg->nocow_writers, + btrfs_wait_nocow_writers_atomic_t, + TASK_UNINTERRUPTIBLE); +} + static const char *alloc_name(u64 flags) { switch (flags) { @@ -4141,7 +4194,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } trans = btrfs_join_transaction(root); @@ -4583,7 +4636,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, */ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, nr_items); + btrfs_wait_ordered_roots(root->fs_info, nr_items, + 0, (u64)-1); } } @@ -4620,7 +4674,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(root, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; @@ -4632,14 +4686,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (trans) return; if (wait_ordered) - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); return; } loops = 0; while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); - nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; + nr_pages = max_reclaim >> PAGE_SHIFT; btrfs_writeback_inodes_sb_nr(root, nr_pages, items); /* * We need to wait for the async pages to actually start before @@ -4671,7 +4726,8 @@ skip_async: loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -6172,6 +6228,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, return 0; } +static void +btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + atomic_inc(&bg->reservations); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_atomic_t(&bg->reservations); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have had allocated an extent from it already, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent. + */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_on_atomic_t(&bg->reservations, + btrfs_wait_bg_reservations_atomic_t, + TASK_UNINTERRUPTIBLE); +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -7025,36 +7132,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, int delalloc) { struct btrfs_block_group_cache *used_bg = NULL; - bool locked = false; -again: + spin_lock(&cluster->refill_lock); - if (locked) { - if (used_bg == cluster->block_group) + while (1) { + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) return used_bg; - up_read(&used_bg->data_rwsem); - btrfs_put_block_group(used_bg); - } + btrfs_get_block_group(used_bg); - used_bg = cluster->block_group; - if (!used_bg) - return NULL; + if (!delalloc) + return used_bg; - if (used_bg == block_group) - return used_bg; + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; - btrfs_get_block_group(used_bg); + spin_unlock(&cluster->refill_lock); - if (!delalloc) - return used_bg; + down_read(&used_bg->data_rwsem); - if (down_read_trylock(&used_bg->data_rwsem)) - return used_bg; + spin_lock(&cluster->refill_lock); + if (used_bg == cluster->block_group) + return used_bg; - spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); - locked = true; - goto again; + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } } static inline void @@ -7431,6 +7537,7 @@ checks: btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } + btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ ins->objectid = search_start; @@ -7612,8 +7719,10 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, flags, delalloc); - - if (ret == -ENOSPC) { + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(root->fs_info, + ins->objectid); + } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); @@ -9058,7 +9167,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err, NULL); + btrfs_handle_fs_error(root->fs_info, err, NULL); return err; } @@ -9386,15 +9495,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) u64 dev_min = 1; u64 dev_nr = 0; u64 target; + int debug; int index; int full = 0; int ret = 0; + debug = btrfs_test_opt(root, ENOSPC_DEBUG); + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); /* odd, couldn't find the block group, leave it alone */ - if (!block_group) + if (!block_group) { + if (debug) + btrfs_warn(root->fs_info, + "can't find block group for bytenr %llu", + bytenr); return -1; + } min_free = btrfs_block_group_used(&block_group->item); @@ -9448,8 +9565,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * this is just a balance, so if we were marked as full * we know there is no space for a new chunk */ - if (full) + if (full) { + if (debug) + btrfs_warn(root->fs_info, + "no space to alloc new chunk for block group %llu", + block_group->key.objectid); goto out; + } index = get_block_group_index(block_group); } @@ -9496,6 +9618,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ret = -1; } } + if (debug && ret == -1) + btrfs_warn(root->fs_info, + "no space to allocate a new chunk for block group %llu", + block_group->key.objectid); mutex_unlock(&root->fs_info->chunk_mutex); btrfs_end_transaction(trans, root); out: diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 76a0c8597d98..2f83448d34fe 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1363,23 +1363,23 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; struct page *page; while (index <= end_index) { page = find_get_page(inode->i_mapping, index); BUG_ON(!page); /* Pages should be in the extent_io_tree */ clear_page_dirty_for_io(page); - page_cache_release(page); + put_page(page); index++; } } void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; struct page *page; while (index <= end_index) { @@ -1387,7 +1387,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) BUG_ON(!page); /* Pages should be in the extent_io_tree */ __set_page_dirty_nobuffers(page); account_page_redirty(page); - page_cache_release(page); + put_page(page); index++; } } @@ -1397,15 +1397,15 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) */ static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; struct page *page; while (index <= end_index) { page = find_get_page(tree->mapping, index); BUG_ON(!page); /* Pages should be in the extent_io_tree */ set_page_writeback(page); - page_cache_release(page); + put_page(page); index++; } } @@ -1556,8 +1556,8 @@ static noinline void __unlock_for_delalloc(struct inode *inode, { int ret; struct page *pages[16]; - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; @@ -1571,7 +1571,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode, for (i = 0; i < ret; i++) { if (pages[i] != locked_page) unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } nr_pages -= ret; index += ret; @@ -1584,9 +1584,9 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 delalloc_start, u64 delalloc_end) { - unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long index = delalloc_start >> PAGE_SHIFT; unsigned long start_index = index; - unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long end_index = delalloc_end >> PAGE_SHIFT; unsigned long pages_locked = 0; struct page *pages[16]; unsigned long nrpages; @@ -1619,11 +1619,11 @@ static noinline int lock_delalloc_pages(struct inode *inode, pages[i]->mapping != inode->i_mapping) { ret = -EAGAIN; unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); goto done; } } - page_cache_release(pages[i]); + put_page(pages[i]); pages_locked++; } nrpages -= ret; @@ -1636,7 +1636,7 @@ done: __unlock_for_delalloc(inode, locked_page, delalloc_start, ((u64)(start_index + pages_locked - 1)) << - PAGE_CACHE_SHIFT); + PAGE_SHIFT); } return ret; } @@ -1696,7 +1696,7 @@ again: free_extent_state(cached_state); cached_state = NULL; if (!loops) { - max_bytes = PAGE_CACHE_SIZE; + max_bytes = PAGE_SIZE; loops = 1; goto again; } else { @@ -1735,8 +1735,8 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; int ret; struct page *pages[16]; - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; @@ -1757,7 +1757,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { - page_cache_release(pages[i]); + put_page(pages[i]); continue; } if (page_ops & PAGE_CLEAR_DIRTY) @@ -1770,7 +1770,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, end_page_writeback(pages[i]); if (page_ops & PAGE_UNLOCK) unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } nr_pages -= ret; index += ret; @@ -1961,7 +1961,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); } @@ -2071,11 +2071,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, struct page *p = eb->pages[i]; ret = repair_io_failure(root->fs_info->btree_inode, start, - PAGE_CACHE_SIZE, start, p, + PAGE_SIZE, start, p, start - page_offset(p), mirror_num); if (ret) break; - start += PAGE_CACHE_SIZE; + start += PAGE_SIZE; } return ret; @@ -2466,8 +2466,8 @@ static void end_bio_extent_writepage(struct bio *bio) * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, "partial page write in btrfs with offset %u and length %u", bvec->bv_offset, bvec->bv_len); @@ -2541,8 +2541,8 @@ static void end_bio_extent_readpage(struct bio *bio) * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, "partial page read in btrfs with offset %u and length %u", bvec->bv_offset, bvec->bv_len); @@ -2598,13 +2598,13 @@ static void end_bio_extent_readpage(struct bio *bio) readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + pgoff_t end_index = i_size >> PAGE_SHIFT; unsigned off; /* Zero out the end if this page straddles i_size */ - off = i_size & (PAGE_CACHE_SIZE-1); + off = i_size & (PAGE_SIZE-1); if (page->index == end_index && off) - zero_user_segment(page, off, PAGE_CACHE_SIZE); + zero_user_segment(page, off, PAGE_SIZE); SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -2768,7 +2768,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, struct bio *bio; int contig = 0; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; - size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + size_t page_size = min_t(size_t, size, PAGE_SIZE); if (bio_ret && *bio_ret) { bio = *bio_ret; @@ -2821,7 +2821,7 @@ static void attach_extent_buffer_page(struct extent_buffer *eb, { if (!PagePrivate(page)) { SetPagePrivate(page); - page_cache_get(page); + get_page(page); set_page_private(page, (unsigned long)eb); } else { WARN_ON(page->private != (unsigned long)eb); @@ -2832,7 +2832,7 @@ void set_page_extent_mapped(struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); - page_cache_get(page); + get_page(page); set_page_private(page, EXTENT_PAGE_PRIVATE); } } @@ -2880,7 +2880,7 @@ static int __do_readpage(struct extent_io_tree *tree, { struct inode *inode = page->mapping->host; u64 start = page_offset(page); - u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 page_end = start + PAGE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; @@ -2909,12 +2909,12 @@ static int __do_readpage(struct extent_io_tree *tree, } } - if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + if (page->index == last_byte >> PAGE_SHIFT) { char *userpage; - size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + size_t zero_offset = last_byte & (PAGE_SIZE - 1); if (zero_offset) { - iosize = PAGE_CACHE_SIZE - zero_offset; + iosize = PAGE_SIZE - zero_offset; userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, iosize); flush_dcache_page(page); @@ -2922,14 +2922,14 @@ static int __do_readpage(struct extent_io_tree *tree, } } while (cur <= end) { - unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + unsigned long pnr = (last_byte >> PAGE_SHIFT) + 1; bool force_bio_submit = false; if (cur >= last_byte) { char *userpage; struct extent_state *cached = NULL; - iosize = PAGE_CACHE_SIZE - pg_offset; + iosize = PAGE_SIZE - pg_offset; userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); @@ -3112,7 +3112,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], get_extent, em_cached, bio, mirror_num, bio_flags, rw, prev_em_start); - page_cache_release(pages[index]); + put_page(pages[index]); } } @@ -3134,10 +3134,10 @@ static void __extent_readpages(struct extent_io_tree *tree, page_start = page_offset(pages[index]); if (!end) { start = page_start; - end = start + PAGE_CACHE_SIZE - 1; + end = start + PAGE_SIZE - 1; first_index = index; } else if (end + 1 == page_start) { - end += PAGE_CACHE_SIZE; + end += PAGE_SIZE; } else { __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, @@ -3145,7 +3145,7 @@ static void __extent_readpages(struct extent_io_tree *tree, bio, mirror_num, bio_flags, rw, prev_em_start); start = page_start; - end = start + PAGE_CACHE_SIZE - 1; + end = start + PAGE_SIZE - 1; first_index = index; } } @@ -3167,13 +3167,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct inode *inode = page->mapping->host; struct btrfs_ordered_extent *ordered; u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; int ret; while (1) { lock_extent(tree, start, end); ordered = btrfs_lookup_ordered_range(inode, start, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (!ordered) break; unlock_extent(tree, start, end); @@ -3200,14 +3200,10 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } -static noinline void update_nr_written(struct page *page, - struct writeback_control *wbc, - unsigned long nr_written) +static void update_nr_written(struct page *page, struct writeback_control *wbc, + unsigned long nr_written) { wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; } /* @@ -3227,7 +3223,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, unsigned long *nr_written) { struct extent_io_tree *tree = epd->tree; - u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; + u64 page_end = delalloc_start + PAGE_SIZE - 1; u64 nr_delalloc; u64 delalloc_to_write = 0; u64 delalloc_end = 0; @@ -3264,13 +3260,11 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, goto done; } /* - * delalloc_end is already one less than the total - * length, so we don't subtract one from - * PAGE_CACHE_SIZE + * delalloc_end is already one less than the total length, so + * we don't subtract one from PAGE_SIZE */ delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + PAGE_SIZE) >> PAGE_SHIFT; delalloc_start = delalloc_end + 1; } if (wbc->nr_to_write < delalloc_to_write) { @@ -3319,7 +3313,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, { struct extent_io_tree *tree = epd->tree; u64 start = page_offset(page); - u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 page_end = start + PAGE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; @@ -3370,6 +3364,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; + unsigned long max_nr; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, @@ -3425,32 +3421,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, continue; } - if (tree->ops && tree->ops->writepage_io_hook) { - ret = tree->ops->writepage_io_hook(page, cur, - cur + iosize - 1); - } else { - ret = 0; + max_nr = (i_size >> PAGE_SHIFT) + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", + page->index, cur, end); } - if (ret) { - SetPageError(page); - } else { - unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; - set_range_writeback(tree, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - btrfs_err(BTRFS_I(inode)->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); - } + ret = submit_extent_page(write_flags, tree, wbc, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0, false); + if (ret) + SetPageError(page); - ret = submit_extent_page(write_flags, tree, wbc, page, - sector, iosize, pg_offset, - bdev, &epd->bio, max_nr, - end_bio_extent_writepage, - 0, 0, 0, false); - if (ret) - SetPageError(page); - } cur = cur + iosize; pg_offset += iosize; nr++; @@ -3477,12 +3464,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct inode *inode = page->mapping->host; struct extent_page_data *epd = data; u64 start = page_offset(page); - u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 page_end = start + PAGE_SIZE - 1; int ret; int nr = 0; size_t pg_offset = 0; loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned long end_index = i_size >> PAGE_SHIFT; int write_flags; unsigned long nr_written = 0; @@ -3497,10 +3484,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ClearPageError(page); - pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + pg_offset = i_size & (PAGE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); return 0; } @@ -3510,7 +3497,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, - PAGE_CACHE_SIZE - pg_offset); + PAGE_SIZE - pg_offset); kunmap_atomic(userpage); flush_dcache_page(page); } @@ -3748,7 +3735,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, - PAGE_CACHE_SIZE, 0, bdev, &epd->bio, + PAGE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags, false); epd->bio_flags = bio_flags; @@ -3760,7 +3747,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, ret = -EIO; break; } - offset += PAGE_CACHE_SIZE; + offset += PAGE_SIZE; update_nr_written(p, wbc, 1); unlock_page(p); } @@ -3804,8 +3791,8 @@ int btree_write_cache_pages(struct address_space *mapping, index = mapping->writeback_index; /* Start from prev offset */ end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -3922,12 +3909,13 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; - int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int range_whole = 0; int scanned = 0; int tag; @@ -3948,8 +3936,10 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, index = mapping->writeback_index; /* Start from prev offset */ end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -3959,6 +3949,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); + done_index = index; while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { @@ -3968,6 +3959,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + done_index = page->index; /* * At this point we hold neither mapping->tree_lock nor * lock on the page itself: the page may be truncated or @@ -4009,8 +4001,20 @@ retry: unlock_page(page); ret = 0; } - if (!err && ret < 0) - err = ret; + if (ret < 0) { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done_index = page->index + 1; + done = 1; + break; + } /* * the filesystem may choose to bump up nr_to_write. @@ -4022,7 +4026,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done && !err) { + if (!scanned && !done) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -4031,8 +4035,12 @@ retry: index = 0; goto retry; } + + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) + mapping->writeback_index = done_index; + btrfs_add_delayed_iput(inode); - return err; + return ret; } static void flush_epd_write_bio(struct extent_page_data *epd) @@ -4083,8 +4091,8 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, int ret = 0; struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + unsigned long nr_pages = (end - start + PAGE_SIZE) >> + PAGE_SHIFT; struct extent_page_data epd = { .bio = NULL, @@ -4102,18 +4110,18 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, }; while (start <= end) { - page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + page = find_get_page(mapping, start >> PAGE_SHIFT); if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, - start + PAGE_CACHE_SIZE - 1, + start + PAGE_SIZE - 1, NULL, 1); unlock_page(page); } - page_cache_release(page); - start += PAGE_CACHE_SIZE; + put_page(page); + start += PAGE_SIZE; } flush_epd_write_bio(&epd); @@ -4163,7 +4171,7 @@ int extent_readpages(struct extent_io_tree *tree, list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { - page_cache_release(page); + put_page(page); continue; } @@ -4197,7 +4205,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, { struct extent_state *cached_state = NULL; u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; start += ALIGN(offset, blocksize); @@ -4223,7 +4231,7 @@ static int try_release_extent_state(struct extent_map_tree *map, struct page *page, gfp_t mask) { u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; int ret = 1; if (test_range_bit(tree, start, end, @@ -4262,7 +4270,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, { struct extent_map *em; u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; if (gfpflags_allow_blocking(mask) && page->mapping->host->i_size > SZ_16M) { @@ -4587,14 +4595,14 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) ClearPagePrivate(page); set_page_private(page, 0); /* One for the page private */ - page_cache_release(page); + put_page(page); } if (mapped) spin_unlock(&page->mapping->private_lock); /* One for when we alloced the page */ - page_cache_release(page); + put_page(page); } while (index != 0); } @@ -4779,7 +4787,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, rcu_read_lock(); eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); /* @@ -4824,12 +4832,12 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) goto free_eb; spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_CACHE_SHIFT, eb); + start >> PAGE_SHIFT, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -4862,7 +4870,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, unsigned long len = fs_info->tree_root->nodesize; unsigned long num_pages = num_extent_pages(start, len); unsigned long i; - unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; struct extent_buffer *eb; struct extent_buffer *exists = NULL; struct page *p; @@ -4896,7 +4904,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (atomic_inc_not_zero(&exists->refs)) { spin_unlock(&mapping->private_lock); unlock_page(p); - page_cache_release(p); + put_page(p); mark_extent_buffer_accessed(exists, p); goto free_eb; } @@ -4908,7 +4916,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, */ ClearPagePrivate(p); WARN_ON(PageDirty(p)); - page_cache_release(p); + put_page(p); } attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); @@ -4925,13 +4933,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) goto free_eb; spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_CACHE_SHIFT, eb); + start >> PAGE_SHIFT, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -4994,7 +5002,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_lock(&fs_info->buffer_lock); radix_tree_delete(&fs_info->buffer_radix, - eb->start >> PAGE_CACHE_SHIFT); + eb->start >> PAGE_SHIFT); spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); @@ -5168,8 +5176,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (start) { WARN_ON(start < eb->start); - start_i = (start >> PAGE_CACHE_SHIFT) - - (eb->start >> PAGE_CACHE_SHIFT); + start_i = (start >> PAGE_SHIFT) - + (eb->start >> PAGE_SHIFT); } else { start_i = 0; } @@ -5252,18 +5260,18 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; - cur = min(len, (PAGE_CACHE_SIZE - offset)); + cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); memcpy(dst, kaddr + offset, cur); @@ -5283,19 +5291,19 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; - cur = min(len, (PAGE_CACHE_SIZE - offset)); + cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); if (copy_to_user(dst, kaddr + offset, cur)) { ret = -EFAULT; @@ -5316,13 +5324,13 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long *map_start, unsigned long *map_len) { - size_t offset = start & (PAGE_CACHE_SIZE - 1); + size_t offset = start & (PAGE_SIZE - 1); char *kaddr; struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; unsigned long end_i = (start_offset + start + min_len - 1) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; if (i != end_i) return -EINVAL; @@ -5332,7 +5340,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, *map_start = 0; } else { offset = 0; - *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + *map_start = ((u64)i << PAGE_SHIFT) - start_offset; } if (start + min_len > eb->len) { @@ -5345,7 +5353,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, p = eb->pages[i]; kaddr = page_address(p); *map = kaddr + offset; - *map_len = PAGE_CACHE_SIZE - offset; + *map_len = PAGE_SIZE - offset; return 0; } @@ -5358,19 +5366,19 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; - cur = min(len, (PAGE_CACHE_SIZE - offset)); + cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); ret = memcmp(ptr, kaddr + offset, cur); @@ -5393,19 +5401,19 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; WARN_ON(!PageUptodate(page)); - cur = min(len, PAGE_CACHE_SIZE - offset); + cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); memcpy(kaddr + offset, src, cur); @@ -5423,19 +5431,19 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, size_t offset; struct page *page; char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; WARN_ON(!PageUptodate(page)); - cur = min(len, PAGE_CACHE_SIZE - offset); + cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); memset(kaddr + offset, c, cur); @@ -5454,19 +5462,19 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, size_t offset; struct page *page; char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; WARN_ON(src->len != dst_len); offset = (start_offset + dst_offset) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); while (len > 0) { page = dst->pages[i]; WARN_ON(!PageUptodate(page)); - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + cur = min(len, (unsigned long)(PAGE_SIZE - offset)); kaddr = page_address(page); read_extent_buffer(src, kaddr + offset, src_offset, cur); @@ -5508,7 +5516,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, unsigned long *page_index, size_t *page_offset) { - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -5519,8 +5527,8 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, */ offset = start_offset + start + byte_offset; - *page_index = offset >> PAGE_CACHE_SHIFT; - *page_offset = offset & (PAGE_CACHE_SIZE - 1); + *page_index = offset >> PAGE_SHIFT; + *page_offset = offset & (PAGE_SIZE - 1); } /** @@ -5572,7 +5580,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, len -= bits_to_set; bits_to_set = BITS_PER_BYTE; mask_to_set = ~0U; - if (++offset >= PAGE_CACHE_SIZE && len > 0) { + if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; WARN_ON(!PageUptodate(page)); @@ -5614,7 +5622,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, len -= bits_to_clear; bits_to_clear = BITS_PER_BYTE; mask_to_clear = ~0U; - if (++offset >= PAGE_CACHE_SIZE && len > 0) { + if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; WARN_ON(!PageUptodate(page)); @@ -5661,7 +5669,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t cur; size_t dst_off_in_page; size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); unsigned long dst_i; unsigned long src_i; @@ -5680,17 +5688,17 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, while (len > 0) { dst_off_in_page = (start_offset + dst_offset) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); src_off_in_page = (start_offset + src_offset) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); - dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_SHIFT; - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + cur = min(len, (unsigned long)(PAGE_SIZE - src_off_in_page)); cur = min_t(unsigned long, cur, - (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + (unsigned long)(PAGE_SIZE - dst_off_in_page)); copy_pages(dst->pages[dst_i], dst->pages[src_i], dst_off_in_page, src_off_in_page, cur); @@ -5709,7 +5717,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); unsigned long dst_i; unsigned long src_i; @@ -5728,13 +5736,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, return; } while (len > 0) { - dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + dst_i = (start_offset + dst_end) >> PAGE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_SHIFT; dst_off_in_page = (start_offset + dst_end) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); src_off_in_page = (start_offset + src_end) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5dbf92e68fbd..981f402bf754 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -71,7 +71,6 @@ struct extent_io_ops { u64 start, u64 end, int *page_started, unsigned long *nr_written); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, @@ -120,7 +119,7 @@ struct extent_state { }; #define INLINE_EXTENT_BUFFER_PAGES 16 -#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) +#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) struct extent_buffer { u64 start; unsigned long len; @@ -365,8 +364,8 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb); static inline unsigned long num_extent_pages(u64 start, u64 len) { - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); + return ((start + len + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (start >> PAGE_SHIFT); } static inline void extent_buffer_get(struct extent_buffer *eb) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b5baf5bdc8e1..7a7d6e253cfc 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -32,7 +32,7 @@ size) - 1)) #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ - PAGE_CACHE_SIZE)) + PAGE_SIZE)) #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ @@ -203,7 +203,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, csum = (u8 *)dst; } - if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) + if (bio->bi_iter.bi_size > PAGE_SIZE * 8) path->reada = READA_FORWARD; WARN_ON(bio->bi_vcnt <= 0); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15a09cb156ce..c98805c35bab 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -414,11 +414,11 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, size_t copied = 0; size_t total_copied = 0; int pg = 0; - int offset = pos & (PAGE_CACHE_SIZE - 1); + int offset = pos & (PAGE_SIZE - 1); while (write_bytes > 0) { size_t count = min_t(size_t, - PAGE_CACHE_SIZE - offset, write_bytes); + PAGE_SIZE - offset, write_bytes); struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page @@ -448,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, if (unlikely(copied == 0)) break; - if (copied < PAGE_CACHE_SIZE - offset) { + if (copied < PAGE_SIZE - offset) { offset += copied; } else { pg++; @@ -473,7 +473,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } } @@ -1297,7 +1297,7 @@ static int prepare_uptodate_page(struct inode *inode, { int ret = 0; - if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && + if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && !PageUptodate(page)) { ret = btrfs_readpage(NULL, page); if (ret) @@ -1323,7 +1323,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, size_t write_bytes, bool force_uptodate) { int i; - unsigned long index = pos >> PAGE_CACHE_SHIFT; + unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili; @@ -1345,7 +1345,7 @@ again: err = prepare_uptodate_page(inode, pages[i], pos + write_bytes, false); if (err) { - page_cache_release(pages[i]); + put_page(pages[i]); if (err == -EAGAIN) { err = 0; goto again; @@ -1360,7 +1360,7 @@ again: fail: while (faili >= 0) { unlock_page(pages[faili]); - page_cache_release(pages[faili]); + put_page(pages[faili]); faili--; } return err; @@ -1408,7 +1408,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); @@ -1497,8 +1497,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, bool force_page_uptodate = false; bool need_unlock; - nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE), - PAGE_CACHE_SIZE / (sizeof(struct page *))); + nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), + PAGE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); @@ -1506,13 +1506,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, return -ENOMEM; while (iov_iter_count(i) > 0) { - size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t offset = pos & (PAGE_SIZE - 1); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), - nrptrs * (size_t)PAGE_CACHE_SIZE - + nrptrs * (size_t)PAGE_SIZE - offset); size_t num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_CACHE_SIZE); + PAGE_SIZE); size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1547,7 +1547,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * write_bytes, so scale down. */ num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_CACHE_SIZE); + PAGE_SIZE); reserve_bytes = round_up(write_bytes + sector_offset, root->sectorsize); goto reserve_metadata; @@ -1609,7 +1609,7 @@ again: } else { force_page_uptodate = false; dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_CACHE_SIZE); + PAGE_SIZE); } /* @@ -1641,7 +1641,7 @@ again: u64 __pos; __pos = round_down(pos, root->sectorsize) + - (dirty_pages << PAGE_CACHE_SHIFT); + (dirty_pages << PAGE_SHIFT); btrfs_delalloc_release_space(inode, __pos, release_bytes); } @@ -1682,7 +1682,7 @@ again: cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); - if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1) + if (dirty_pages < (root->nodesize >> PAGE_SHIFT) + 1) btrfs_btree_balance_dirty(root); pos += copied; @@ -1696,25 +1696,26 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, pos, release_bytes); + btrfs_delalloc_release_space(inode, + round_down(pos, root->sectorsize), + release_bytes); } } return num_written ? num_written : ret; } -static ssize_t __btrfs_direct_write(struct kiocb *iocb, - struct iov_iter *from, - loff_t pos) +static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + loff_t pos = iocb->ki_pos; ssize_t written; ssize_t written_buffered; loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from, pos); + written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -1738,8 +1739,8 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, goto out; written += written_buffered; iocb->ki_pos = pos + written_buffered; - invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); out: return written ? written : err; } @@ -1832,7 +1833,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { - num_written = __btrfs_direct_write(iocb, from, pos); + num_written = __btrfs_direct_write(iocb, from); } else { num_written = __btrfs_buffered_write(file, from, pos); if (num_written > 0) @@ -1852,11 +1853,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; spin_unlock(&BTRFS_I(inode)->lock); - if (num_written > 0) { - err = generic_write_sync(file, pos, num_written); - if (err < 0) - num_written = err; - } + if (num_written > 0) + num_written = generic_write_sync(iocb, num_written); if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); @@ -1905,7 +1903,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -2682,9 +2680,12 @@ static long btrfs_fallocate(struct file *file, int mode, return ret; inode_lock(inode); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + goto out; + } /* * TODO: Move these two operations after we have checked @@ -2953,7 +2954,7 @@ const struct file_operations btrfs_file_operations = { .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8f835bfa1bdd..5e6062c26129 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,7 +29,7 @@ #include "inode-map.h" #include "volumes.h" -#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG SZ_32K struct btrfs_trim_range { @@ -295,7 +295,7 @@ static int readahead_cache(struct inode *inode) return -ENOMEM; file_ra_state_init(ra, inode->i_mapping); - last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); @@ -310,14 +310,14 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, int num_pages; int check_crcs = 0; - num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); + num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) check_crcs = 1; /* Make sure we can fit our crcs into the first page */ if (write && check_crcs && - (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) + (num_pages * sizeof(u32)) >= PAGE_SIZE) return -ENOSPC; memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); @@ -354,9 +354,9 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) io_ctl->page = io_ctl->pages[io_ctl->index++]; io_ctl->cur = page_address(io_ctl->page); io_ctl->orig = io_ctl->cur; - io_ctl->size = PAGE_CACHE_SIZE; + io_ctl->size = PAGE_SIZE; if (clear) - memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); + memset(io_ctl->cur, 0, PAGE_SIZE); } static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) @@ -369,7 +369,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) if (io_ctl->pages[i]) { ClearPageChecked(io_ctl->pages[i]); unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); + put_page(io_ctl->pages[i]); } } } @@ -475,7 +475,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) offset = sizeof(u32) * io_ctl->num_pages; crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); @@ -503,7 +503,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) io_ctl_map_page(io_ctl, 0); crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->root->fs_info, @@ -561,7 +561,7 @@ static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) io_ctl_map_page(io_ctl, 0); } - memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); + memcpy(io_ctl->cur, bitmap, PAGE_SIZE); io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index < io_ctl->num_pages) io_ctl_map_page(io_ctl, 0); @@ -621,7 +621,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, if (ret) return ret; - memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); + memcpy(entry->bitmap, io_ctl->cur, PAGE_SIZE); io_ctl_unmap_page(io_ctl); return 0; @@ -775,7 +775,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } else { ASSERT(num_bitmaps); num_bitmaps--; - e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); if (!e->bitmap) { kmem_cache_free( btrfs_free_space_cachep, e); @@ -1660,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as * we add more bitmaps. */ - bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE; + bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_SIZE; if (bitmap_bytes >= max_bytes) { ctl->extents_thresh = 0; @@ -2111,7 +2111,7 @@ new_bitmap: } /* allocate the bitmap */ - info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; @@ -3580,7 +3580,7 @@ again: } if (!map) { - map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + map = kzalloc(PAGE_SIZE, GFP_NOFS); if (!map) { kmem_cache_free(btrfs_free_space_cachep, info); return -ENOMEM; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index be4d22a5022f..b8acc07ac6c2 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT, NULL); + btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 1f0ec19b23f6..70107f7c9307 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -283,7 +283,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) } #define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space)) -#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define INODES_PER_BITMAP (PAGE_SIZE * 8) /* * The goal is to keep the memory used by the free_ino tree won't @@ -317,7 +317,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) } ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * - PAGE_CACHE_SIZE / sizeof(*info); + PAGE_SIZE / sizeof(*info); } /* @@ -481,12 +481,12 @@ again: spin_lock(&ctl->tree_lock); prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; - prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE); - prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE; + prealloc = ALIGN(prealloc, PAGE_SIZE); + prealloc += ctl->total_bitmaps * PAGE_SIZE; spin_unlock(&ctl->tree_lock); /* Just to make sure we have enough space */ - prealloc += 8 * PAGE_CACHE_SIZE; + prealloc += 8 * PAGE_SIZE; ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); if (ret) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 41a5688ffdfe..91419ef79b00 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -194,7 +194,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, while (compressed_size > 0) { cpage = compressed_pages[i]; cur_size = min_t(unsigned long, compressed_size, - PAGE_CACHE_SIZE); + PAGE_SIZE); kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); @@ -208,13 +208,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, compress_type); } else { page = find_get_page(inode->i_mapping, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = start & (PAGE_CACHE_SIZE - 1); + offset = start & (PAGE_SIZE - 1); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); - page_cache_release(page); + put_page(page); } btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -322,7 +322,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE); + btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; @@ -435,8 +435,8 @@ static noinline void compress_file_range(struct inode *inode, actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; - nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE); + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE); /* * we don't want to send crud past the end of i_size through @@ -514,7 +514,7 @@ again: if (!ret) { unsigned long offset = total_compressed & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); struct page *page = pages[nr_pages_ret - 1]; char *kaddr; @@ -524,7 +524,7 @@ again: if (offset) { kaddr = kmap_atomic(page); memset(kaddr + offset, 0, - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); kunmap_atomic(kaddr); } will_compress = 1; @@ -580,7 +580,7 @@ cont: * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = ALIGN(total_in, PAGE_CACHE_SIZE); + total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { @@ -594,7 +594,7 @@ cont: */ for (i = 0; i < nr_pages_ret; i++) { WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); + put_page(pages[i]); } kfree(pages); pages = NULL; @@ -650,7 +650,7 @@ cleanup_and_bail_uncompressed: free_pages_out: for (i = 0; i < nr_pages_ret; i++) { WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); + put_page(pages[i]); } kfree(pages); } @@ -664,7 +664,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); - page_cache_release(async_extent->pages[i]); + put_page(async_extent->pages[i]); } kfree(async_extent->pages); async_extent->nr_pages = 0; @@ -824,6 +824,7 @@ retry: async_extent->ram_size - 1, 0); goto out_free_reserve; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); /* * clear dirty, set writeback and unlock the pages. @@ -861,6 +862,7 @@ retry: } return; out_free_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, @@ -966,7 +968,7 @@ static noinline int cow_file_range(struct inode *inode, PAGE_END_WRITEBACK); *nr_written = *nr_written + - (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; goto out; } else if (ret < 0) { @@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode, goto out_drop_extent_cache; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (disk_num_bytes < cur_alloc_size) break; @@ -1066,6 +1070,7 @@ out: out_drop_extent_cache: btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, @@ -1106,8 +1111,8 @@ static noinline void async_cow_submit(struct btrfs_work *work) async_cow = container_of(work, struct async_cow, work); root = async_cow->root; - nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> + PAGE_SHIFT; /* * atomic_sub_return implies a barrier for waitqueue_active @@ -1164,8 +1169,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow_start, async_cow_submit, async_cow_free); - nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + nr_pages = (cur_end - start + PAGE_SIZE) >> + PAGE_SHIFT; atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); btrfs_queue_work(root->fs_info->delalloc_workers, @@ -1377,6 +1382,9 @@ next_slot: */ if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out_check; + if (!btrfs_inc_nocow_writers(root->fs_info, + disk_bytenr)) + goto out_check; nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1391,6 +1399,9 @@ out_check: path->slots[0]++; if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto next_slot; } if (!nocow) { @@ -1411,6 +1422,9 @@ out_check: if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto error; } cow_start = (u64)-1; @@ -1453,6 +1467,8 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, disk_bytenr); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == @@ -1960,7 +1976,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state) { - WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); + WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, cached_state, GFP_NOFS); } @@ -1993,7 +2009,7 @@ again: inode = page->mapping->host; page_start = page_offset(page); - page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + page_end = page_offset(page) + PAGE_SIZE - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); @@ -2003,7 +2019,7 @@ again: goto out; ordered = btrfs_lookup_ordered_range(inode, page_start, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (ordered) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -2014,7 +2030,7 @@ again: } ret = btrfs_delalloc_reserve_space(inode, page_start, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -2030,7 +2046,7 @@ out: &cached_state, GFP_NOFS); out_page: unlock_page(page); - page_cache_release(page); + put_page(page); kfree(fixup); } @@ -2063,7 +2079,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) return -EAGAIN; SetPageChecked(page); - page_cache_get(page); + get_page(page); btrfs_init_work(&fixup->work, btrfs_fixup_helper, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; @@ -4247,7 +4263,7 @@ static int truncate_inline_extent(struct inode *inode, if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { loff_t offset = new_size; - loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + loff_t page_end = ALIGN(offset, PAGE_SIZE); /* * Zero out the remaining of the last page of our inline extent, @@ -4633,7 +4649,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; char *kaddr; u32 blocksize = root->sectorsize; - pgoff_t index = from >> PAGE_CACHE_SHIFT; + pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); @@ -4668,7 +4684,7 @@ again: lock_page(page); if (page->mapping != mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } if (!PageUptodate(page)) { @@ -4686,7 +4702,7 @@ again: unlock_extent_cached(io_tree, block_start, block_end, &cached_state, GFP_NOFS); unlock_page(page); - page_cache_release(page); + put_page(page); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); goto again; @@ -4728,7 +4744,7 @@ out_unlock: btrfs_delalloc_release_space(inode, block_start, blocksize); unlock_page(page); - page_cache_release(page); + put_page(page); out: return ret; } @@ -6717,7 +6733,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + max_size = min_t(unsigned long, PAGE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); kfree(tmp); @@ -6879,8 +6895,8 @@ next: size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; - copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, - size - extent_offset); + copy_size = min_t(u64, PAGE_SIZE - pg_offset, + size - extent_offset); em->start = extent_start + extent_offset; em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; @@ -6899,9 +6915,9 @@ next: map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); - if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + if (pg_offset + copy_size < PAGE_SIZE) { memset(map + pg_offset + copy_size, 0, - PAGE_CACHE_SIZE - pg_offset - + PAGE_SIZE - pg_offset - copy_size); } kunmap(page); @@ -7129,6 +7145,43 @@ out: return em; } +static struct extent_map *btrfs_create_dio_extent(struct inode *inode, + const u64 start, + const u64 len, + const u64 orig_start, + const u64 block_start, + const u64 block_len, + const u64 orig_block_len, + const u64 ram_bytes, + const int type) +{ + struct extent_map *em = NULL; + int ret; + + down_read(&BTRFS_I(inode)->dio_sem); + if (type != BTRFS_ORDERED_NOCOW) { + em = create_pinned_em(inode, start, len, orig_start, + block_start, block_len, orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) + goto out; + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, + len, block_len, type); + if (ret) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_cache(inode, start, + start + len - 1, 0); + } + em = ERR_PTR(ret); + } + out: + up_read(&BTRFS_I(inode)->dio_sem); + + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -7144,41 +7197,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - /* - * Create the ordered extent before the extent map. This is to avoid - * races with the fast fsync path that would lead to it logging file - * extent items that point to disk extents that were not yet written to. - * The fast fsync path collects ordered extents into a local list and - * then collects all the new extent maps, so we must create the ordered - * extent first and make sure the fast fsync path collects any new - * ordered extents after collecting new extent maps as well. - * The fsync path simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, - ins.offset, ins.offset, 0); - if (ret) { + em = btrfs_create_dio_extent(inode, start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, 0); + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (IS_ERR(em)) btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return ERR_PTR(ret); - } - - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - oe = btrfs_lookup_ordered_extent(inode, start); - ASSERT(oe); - if (WARN_ON(!oe)) - return em; - set_bit(BTRFS_ORDERED_IOERR, &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* Once for our lookup and once for the ordered extents tree. */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - } return em; } @@ -7336,12 +7361,12 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) int start_idx; int end_idx; - start_idx = start >> PAGE_CACHE_SHIFT; + start_idx = start >> PAGE_SHIFT; /* * end is the last byte in the last page. end == start is legal */ - end_idx = end >> PAGE_CACHE_SHIFT; + end_idx = end >> PAGE_SHIFT; rcu_read_lock(); @@ -7382,7 +7407,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) * include/linux/pagemap.h for details. */ if (unlikely(page != *pagep)) { - page_cache_release(page); + put_page(page); page = NULL; } } @@ -7390,7 +7415,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) if (page) { if (page->index <= end_idx) found = true; - page_cache_release(page); + put_page(page); } rcu_read_unlock(); @@ -7650,24 +7675,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + &orig_block_len, &ram_bytes) == 1 && + btrfs_inc_nocow_writers(root->fs_info, block_start)) { + struct extent_map *em2; + + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(root->fs_info, block_start); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - em = create_pinned_em(inode, start, len, - orig_start, - block_start, len, - orig_block_len, - ram_bytes, type); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } + em = em2; } - - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); - if (ret) { - free_extent_map(em); + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); goto unlock_err; } goto unlock; @@ -8541,13 +8563,13 @@ out: return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_data dio_data = { 0 }; + loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8607,7 +8629,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ret = __blockdev_direct_IO(iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iter, offset, btrfs_get_blocks_direct, NULL, + iter, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; @@ -8719,7 +8741,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) if (ret == 1) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } return ret; } @@ -8739,7 +8761,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; int inode_evicting = inode->i_state & I_FREEING; @@ -8822,7 +8844,7 @@ again: * 2) Not written to disk * This means the reserved space should be freed here. */ - btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); + btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8837,7 +8859,7 @@ again: if (PagePrivate(page)) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } } @@ -8874,11 +8896,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; u64 end; - reserved_space = PAGE_CACHE_SIZE; + reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + page_end = page_start + PAGE_SIZE - 1; end = page_end; /* @@ -8934,15 +8956,15 @@ again: goto again; } - if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) { + if (page->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, root->sectorsize); - if (reserved_space < PAGE_CACHE_SIZE) { + if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, page_start, - PAGE_CACHE_SIZE - reserved_space); + PAGE_SIZE - reserved_space); } } @@ -8969,14 +8991,14 @@ again: ret = 0; /* page is wholly or partially inside EOF */ - if (page_start + PAGE_CACHE_SIZE > size) - zero_start = size & ~PAGE_CACHE_MASK; + if (page_start + PAGE_SIZE > size) + zero_start = size & ~PAGE_MASK; else - zero_start = PAGE_CACHE_SIZE; + zero_start = PAGE_SIZE; - if (zero_start != PAGE_CACHE_SIZE) { + if (zero_start != PAGE_SIZE) { kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); flush_dcache_page(page); kunmap(page); } @@ -9230,6 +9252,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->dio_sem); return inode; } @@ -9387,10 +9410,281 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } +static int btrfs_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + struct dentry *parent; + u64 old_ino = btrfs_ino(old_inode); + u64 new_ino = btrfs_ino(new_inode); + u64 old_idx = 0; + u64 new_idx = 0; + u64 root_objectid; + int ret; + bool root_log_pinned = false; + bool dest_log_pinned = false; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + /* close the race window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&dest->fs_info->subvol_sem); + + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 12); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + /* + * We need to find a free sequence number both in the source and + * in the destination directory for the exchange. + */ + ret = btrfs_set_inode_index(new_dir, &old_idx); + if (ret) + goto out_fail; + ret = btrfs_set_inode_index(old_dir, &new_idx); + if (ret) + goto out_fail; + + BTRFS_I(old_inode)->dir_index = 0ULL; + BTRFS_I(new_inode)->dir_index = 0ULL; + + /* Reference for the source. */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(root->fs_info, trans); + } else { + btrfs_pin_log_trans(root); + root_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_ino, + btrfs_ino(new_dir), old_idx); + if (ret) + goto out_fail; + } + + /* And now for the dest. */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(dest->fs_info, trans); + } else { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, root, + old_dentry->d_name.name, + old_dentry->d_name.len, + new_ino, + btrfs_ino(old_dir), new_idx); + if (ret) + goto out_fail; + } + + /* Update inode version and ctime/mtime. */ + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); + inode_inc_iversion(new_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + new_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) { + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + btrfs_record_unlink_dir(trans, new_dir, new_inode, 1); + } + + /* src is a subvolume */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, + root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { /* src is an inode */ + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + /* dest is a subvolume */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + } else { /* dest is an inode */ + ret = __btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, dest, new_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, old_idx); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, old_dir, new_inode, + old_dentry->d_name.name, + old_dentry->d_name.len, 0, new_idx); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = old_idx; + if (new_inode->i_nlink == 1) + BTRFS_I(new_inode)->dir_index = new_idx; + + if (root_log_pinned) { + parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + parent = old_dentry->d_parent; + btrfs_log_new_name(trans, new_inode, new_dir, parent); + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } +out_fail: + /* + * If we have pinned a log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. + */ + if (ret && (root_log_pinned || dest_log_pinned)) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + if (root_log_pinned) { + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } + } + ret = btrfs_end_transaction(trans, root); +out_notrans: + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&dest->fs_info->subvol_sem); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + return ret; +} + +static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry) +{ + int ret; + struct inode *inode; + u64 objectid; + u64 index; + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + return ret; + + inode = btrfs_new_inode(trans, root, dir, + dentry->d_name.name, + dentry->d_name.len, + btrfs_ino(dir), + objectid, + S_IFCHR | WHITEOUT_MODE, + &index); + + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + return ret; + } + + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, + WHITEOUT_DEV); + + ret = btrfs_init_inode_security(trans, inode, dir, + &dentry->d_name); + if (ret) + goto out; + + ret = btrfs_add_nondir(trans, dir, dentry, + inode, 0, index); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, inode); +out: + unlock_new_inode(inode); + if (ret) + inode_dec_link_count(inode); + iput(inode); + + return ret; +} + static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btrfs_trans_handle *trans; + unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); @@ -9399,6 +9693,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, u64 root_objectid; int ret; u64 old_ino = btrfs_ino(old_inode); + bool log_pinned = false; if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9449,15 +9744,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * We want to reserve the absolute worst case amount of items. So if * both inodes are subvols and we need to unlink them then that would * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal + * would require 5 item modifications, so we'll assume they are normal * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. + * If our rename has the whiteout flag, we need more 5 units for the + * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item + * when selinux is enabled). */ - trans = btrfs_start_transaction(root, 11); + trans_num_items = 11; + if (flags & RENAME_WHITEOUT) + trans_num_items += 5; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_notrans; - } + ret = PTR_ERR(trans); + goto out_notrans; + } if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -9471,6 +9772,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(root->fs_info, trans); } else { + btrfs_pin_log_trans(root); + log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9478,14 +9781,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_ino(new_dir), index); if (ret) goto out_fail; - /* - * this is an ugly little race, but the rename is required - * to make sure that if we crash, the inode is either at the - * old name or the new one. pinning the log transaction lets - * us make sure we don't allow a log commit to come in after - * we unlink the name but before we add the new name back in. - */ - btrfs_pin_log_trans(root); } inode_inc_iversion(old_dir); @@ -9552,12 +9847,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (log_pinned) { struct dentry *parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); btrfs_end_log_trans(root); + log_pinned = false; + } + + if (flags & RENAME_WHITEOUT) { + ret = btrfs_whiteout_for_rename(trans, root, old_dir, + old_dentry); + + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } } out_fail: + /* + * If we have pinned the log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. + */ + if (ret && log_pinned) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + btrfs_end_log_trans(root); + log_pinned = false; + } btrfs_end_transaction(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9570,10 +9899,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); + if (flags & RENAME_EXCHANGE) + return btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } static void btrfs_run_delalloc_work(struct btrfs_work *work) @@ -9942,6 +10275,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, @@ -10160,10 +10494,10 @@ static const struct inode_operations btrfs_dir_inode_operations = { .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, @@ -10184,7 +10518,7 @@ static const struct file_operations btrfs_dir_file_operations = { .iterate = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, @@ -10237,10 +10571,10 @@ static const struct address_space_operations btrfs_symlink_aops = { static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, @@ -10251,10 +10585,10 @@ static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, @@ -10265,10 +10599,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 053e677839fe..4e700694b741 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -125,10 +125,10 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; - if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) - iflags |= FS_COMPR_FL; - else if (flags & BTRFS_INODE_NOCOMPRESS) + if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; + else if (flags & BTRFS_INODE_COMPRESS) + iflags |= FS_COMPR_FL; return iflags; } @@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_root_item root_item; + struct btrfs_root_item *root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir, u64 qgroup_reserved; uuid_le new_uuid; + root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); + if (!root_item) + return -ENOMEM; + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) - return ret; + goto fail_free; /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assume subvolme qgroup's level to be 0. */ - if (btrfs_qgroup_level(objectid)) - return -ENOSPC; + if (btrfs_qgroup_level(objectid)) { + ret = -ENOSPC; + goto fail_free; + } btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, &qgroup_reserved, false); if (ret) - return ret; + goto fail_free; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); - return ret; + goto fail_free; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); - memset(&root_item, 0, sizeof(root_item)); - - inode_item = &root_item.inode; + inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); - btrfs_set_root_flags(&root_item, 0); - btrfs_set_root_limit(&root_item, 0); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, leaf->len); - btrfs_set_root_last_snapshot(&root_item, 0); + btrfs_set_root_bytenr(root_item, leaf->start); + btrfs_set_root_generation(root_item, trans->transid); + btrfs_set_root_level(root_item, 0); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_used(root_item, leaf->len); + btrfs_set_root_last_snapshot(root_item, 0); - btrfs_set_root_generation_v2(&root_item, - btrfs_root_generation(&root_item)); + btrfs_set_root_generation_v2(root_item, + btrfs_root_generation(root_item)); uuid_le_gen(&new_uuid); - memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); - btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); - btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); - root_item.ctime = root_item.otime; - btrfs_set_root_ctransid(&root_item, trans->transid); - btrfs_set_root_otransid(&root_item, trans->transid); + memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); + root_item->ctime = root_item->otime; + btrfs_set_root_ctransid(root_item, trans->transid); + btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(&root_item, new_dirid); + btrfs_set_root_dirid(root_item, new_dirid); key.objectid = objectid; key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &root_item); + root_item); if (ret) goto fail; @@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir, BUG_ON(ret); ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, - root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) btrfs_abort_transaction(trans, root, ret); fail: + kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); @@ -629,6 +634,10 @@ fail: d_instantiate(dentry, inode); } return ret; + +fail_free: + kfree(root_item); + return ret; } static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) @@ -681,7 +690,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1); + btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -837,9 +846,11 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - if (error == -EINTR) - return error; + inode_lock_nested(dir, I_MUTEX_PARENT); + // XXX: should've been + // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + // if (error == -EINTR) + // return error; dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -898,7 +909,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) u64 end; read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); read_unlock(&em_tree->lock); if (em) { @@ -988,7 +999,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; - u64 len = PAGE_CACHE_SIZE; + u64 len = PAGE_SIZE; /* * hopefully we have this extent in the tree already, try without @@ -1124,15 +1135,15 @@ static int cluster_pages_for_defrag(struct inode *inode, struct extent_io_tree *tree; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + file_end = (isize - 1) >> PAGE_SHIFT; if (!isize || start_index > file_end) return 0; page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(inode, - start_index << PAGE_CACHE_SHIFT, - page_cnt << PAGE_CACHE_SHIFT); + start_index << PAGE_SHIFT, + page_cnt << PAGE_SHIFT); if (ret) return ret; i_done = 0; @@ -1148,7 +1159,7 @@ again: break; page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + page_end = page_start + PAGE_SIZE - 1; while (1) { lock_extent_bits(tree, page_start, page_end, &cached_state); @@ -1169,7 +1180,7 @@ again: */ if (page->mapping != inode->i_mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } } @@ -1179,7 +1190,7 @@ again: lock_page(page); if (!PageUptodate(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); ret = -EIO; break; } @@ -1187,7 +1198,7 @@ again: if (page->mapping != inode->i_mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } @@ -1208,7 +1219,7 @@ again: wait_on_page_writeback(pages[i]); page_start = page_offset(pages[0]); - page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; + page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); @@ -1222,8 +1233,8 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - start_index << PAGE_CACHE_SHIFT, - (page_cnt - i_done) << PAGE_CACHE_SHIFT); + start_index << PAGE_SHIFT, + (page_cnt - i_done) << PAGE_SHIFT); } @@ -1240,17 +1251,17 @@ again: set_page_extent_mapped(pages[i]); set_page_dirty(pages[i]); unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } return i_done; out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } btrfs_delalloc_release_space(inode, - start_index << PAGE_CACHE_SHIFT, - page_cnt << PAGE_CACHE_SHIFT); + start_index << PAGE_SHIFT, + page_cnt << PAGE_SHIFT); return ret; } @@ -1273,7 +1284,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT; + unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; unsigned long cluster = max_cluster; u64 new_align = ~((u64)SZ_128K - 1); struct page **pages = NULL; @@ -1317,9 +1328,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, /* find the last page to defrag */ if (range->start + range->len > range->start) { last_index = min_t(u64, isize - 1, - range->start + range->len - 1) >> PAGE_CACHE_SHIFT; + range->start + range->len - 1) >> PAGE_SHIFT; } else { - last_index = (isize - 1) >> PAGE_CACHE_SHIFT; + last_index = (isize - 1) >> PAGE_SHIFT; } if (newer_than) { @@ -1331,11 +1342,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * we always align our defrag to help keep * the extents in the file evenly spaced */ - i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + i = (newer_off & new_align) >> PAGE_SHIFT; } else goto out_ra; } else { - i = range->start >> PAGE_CACHE_SHIFT; + i = range->start >> PAGE_SHIFT; } if (!max_to_defrag) max_to_defrag = last_index - i + 1; @@ -1348,7 +1359,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, inode->i_mapping->writeback_index = i; while (i <= last_index && defrag_count < max_to_defrag && - (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) { + (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) { /* * make sure we stop running if someone unmounts * the FS @@ -1362,7 +1373,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; } - if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, extent_thresh, &last_len, &skip, &defrag_end, range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { @@ -1371,14 +1382,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the should_defrag function tells us how much to skip * bump our counter by the suggested amount */ - next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE); + next = DIV_ROUND_UP(skip, PAGE_SIZE); i = max(i + 1, next); continue; } if (!newer_than) { - cluster = (PAGE_CACHE_ALIGN(defrag_end) >> - PAGE_CACHE_SHIFT) - i; + cluster = (PAGE_ALIGN(defrag_end) >> + PAGE_SHIFT) - i; cluster = min(cluster, max_cluster); } else { cluster = max_cluster; @@ -1412,20 +1423,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i += ret; newer_off = max(newer_off + 1, - (u64)i << PAGE_CACHE_SHIFT); + (u64)i << PAGE_SHIFT); ret = find_new_extents(root, inode, newer_than, &newer_off, SZ_64K); if (!ret) { range->start = newer_off; - i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + i = (newer_off & new_align) >> PAGE_SHIFT; } else { break; } } else { if (ret > 0) { i += ret; - last_len += ret << PAGE_CACHE_SHIFT; + last_len += ret << PAGE_SHIFT; } else { i++; last_len = 0; @@ -1654,7 +1665,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { - btrfs_info(BTRFS_I(src_inode)->root->fs_info, + btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; } else if (!inode_owner_or_capable(src_inode)) { @@ -1722,7 +1733,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { - if (vol_args->size > PAGE_CACHE_SIZE) { + if (vol_args->size > PAGE_SIZE) { ret = -EINVAL; goto free_args; } @@ -2366,9 +2377,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; - err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - if (err == -EINTR) - goto out_drop_write; + inode_lock_nested(dir, I_MUTEX_PARENT); + // XXX: should've been + // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + // if (err == -EINTR) + // goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2558,7 +2571,7 @@ out_dput: dput(dentry); out_unlock_dir: inode_unlock(dir); -out_drop_write: +//out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); @@ -2667,10 +2680,10 @@ out: return ret; } -static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; - struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2686,7 +2699,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) goto err_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + /* Check for compatibility reject unknown flags */ + if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) + return -EOPNOTSUPP; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { @@ -2695,13 +2710,23 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } mutex_lock(&root->fs_info->volume_mutex); - ret = btrfs_rm_device(root, vol_args->name); + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + ret = btrfs_rm_device(root, NULL, vol_args->devid); + } else { + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name, 0); + } mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); - if (!ret) - btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); - + if (!ret) { + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) + btrfs_info(root->fs_info, "device deleted: id %llu", + vol_args->devid); + else + btrfs_info(root->fs_info, "device deleted: %s", + vol_args->name); + } out: kfree(vol_args); err_drop: @@ -2709,6 +2734,47 @@ err_drop: return ret; } +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_drop_write; + } + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + mutex_lock(&root->fs_info->volume_mutex); + ret = btrfs_rm_device(root, vol_args->name, 0); + mutex_unlock(&root->fs_info->volume_mutex); + + if (!ret) + btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); + kfree(vol_args); +out: + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +out_drop_write: + mnt_drop_write_file(file); + + return ret; +} + static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -2806,12 +2872,12 @@ static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) lock_page(page); if (!PageUptodate(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); return ERR_PTR(-EIO); } if (page->mapping != inode->i_mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); return ERR_PTR(-EAGAIN); } } @@ -2823,7 +2889,7 @@ static int gather_extent_pages(struct inode *inode, struct page **pages, int num_pages, u64 off) { int i; - pgoff_t index = off >> PAGE_CACHE_SHIFT; + pgoff_t index = off >> PAGE_SHIFT; for (i = 0; i < num_pages; i++) { again: @@ -2932,12 +2998,12 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp) pg = cmp->src_pages[i]; if (pg) { unlock_page(pg); - page_cache_release(pg); + put_page(pg); } pg = cmp->dst_pages[i]; if (pg) { unlock_page(pg); - page_cache_release(pg); + put_page(pg); } } kfree(cmp->src_pages); @@ -2949,7 +3015,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, u64 len, struct cmp_pages *cmp) { int ret; - int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; struct page **src_pgarr, **dst_pgarr; /* @@ -2987,12 +3053,12 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, int ret = 0; int i; struct page *src_page, *dst_page; - unsigned int cmp_len = PAGE_CACHE_SIZE; + unsigned int cmp_len = PAGE_SIZE; void *addr, *dst_addr; i = 0; while (len) { - if (len < PAGE_CACHE_SIZE) + if (len < PAGE_SIZE) cmp_len = len; BUG_ON(i >= cmp->num_pages); @@ -3191,7 +3257,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, if (olen > BTRFS_MAX_DEDUPE_LEN) olen = BTRFS_MAX_DEDUPE_LEN; - if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { + if (WARN_ON_ONCE(bs < PAGE_SIZE)) { /* * Btrfs does not support blocksize < page_size. As a * result, btrfs_cmp_data() won't correctly handle @@ -3468,13 +3534,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = vmalloc(root->nodesize); - if (!buf) - return ret; + buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN); + if (!buf) { + buf = vmalloc(root->nodesize); + if (!buf) + return ret; + } path = btrfs_alloc_path(); if (!path) { - vfree(buf); + kvfree(buf); return ret; } @@ -3775,7 +3844,7 @@ process_slot: out: btrfs_free_path(path); - vfree(buf); + kvfree(buf); return ret; } @@ -3891,8 +3960,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * data immediately and not the previous data. */ truncate_inode_pages_range(&inode->i_data, - round_down(destoff, PAGE_CACHE_SIZE), - round_up(destoff + len, PAGE_CACHE_SIZE) - 1); + round_down(destoff, PAGE_SIZE), + round_up(destoff + len, PAGE_SIZE) - 1); out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); @@ -4124,7 +4193,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) /* we generally have at most 6 or so space infos, one for each raid * level. So, a whole page should be more than enough for everyone */ - if (alloc_size > PAGE_CACHE_SIZE) + if (alloc_size > PAGE_SIZE) return -ENOMEM; space_args.total_spaces = 0; @@ -4376,7 +4445,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { - ret = btrfs_dev_replace_start(root, p); + ret = btrfs_dev_replace_by_ioctl(root, p); atomic_set( &root->fs_info->mutually_exclusive_operation_running, 0); @@ -4847,8 +4916,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_std_error(root->fs_info, ret, - "failed to update qgroup status and info\n"); + btrfs_handle_fs_error(root->fs_info, err, + "failed to update qgroup status and info"); err = btrfs_end_transaction(trans, root); if (err && !ret) ret = err; @@ -5394,9 +5463,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) if (ret) return ret; + ret = mnt_want_write_file(file); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop_write; + } spin_lock(&root->fs_info->super_lock); newflags = btrfs_super_compat_flags(super_block); @@ -5415,7 +5490,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) btrfs_set_super_incompat_flags(super_block, newflags); spin_unlock(&root->fs_info->super_lock); - return btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); +out_drop_write: + mnt_drop_write_file(file); + + return ret; } long btrfs_ioctl(struct file *file, unsigned int @@ -5459,6 +5538,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); + case BTRFS_IOC_RM_DEV_V2: + return btrfs_ioctl_rm_dev_v2(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(root, argp); case BTRFS_IOC_DEV_INFO: @@ -5552,3 +5633,24 @@ long btrfs_ioctl(struct file *file, unsigned int return -ENOTTY; } + +#ifdef CONFIG_COMPAT +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index a2f051347731..1adfbe7be6b8 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -55,8 +55,8 @@ static struct list_head *lzo_alloc_workspace(void) return ERR_PTR(-ENOMEM); workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); - workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); - workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -116,7 +116,7 @@ static int lzo_compress_pages(struct list_head *ws, *total_out = 0; *total_in = 0; - in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap(in_page); /* @@ -133,10 +133,10 @@ static int lzo_compress_pages(struct list_head *ws, tot_out = LZO_LEN; pages[0] = out_page; nr_pages = 1; - pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + pg_bytes_left = PAGE_SIZE - LZO_LEN; /* compress at most one page of data each time */ - in_len = min(len, PAGE_CACHE_SIZE); + in_len = min(len, PAGE_SIZE); while (tot_in < len) { ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); @@ -201,7 +201,7 @@ static int lzo_compress_pages(struct list_head *ws, cpage_out = kmap(out_page); pages[nr_pages++] = out_page; - pg_bytes_left = PAGE_CACHE_SIZE; + pg_bytes_left = PAGE_SIZE; out_offset = 0; } } @@ -221,12 +221,12 @@ static int lzo_compress_pages(struct list_head *ws, bytes_left = len - tot_in; kunmap(in_page); - page_cache_release(in_page); + put_page(in_page); - start += PAGE_CACHE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + start += PAGE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap(in_page); - in_len = min(bytes_left, PAGE_CACHE_SIZE); + in_len = min(bytes_left, PAGE_SIZE); } if (tot_out > tot_in) @@ -248,7 +248,7 @@ out: if (in_page) { kunmap(in_page); - page_cache_release(in_page); + put_page(in_page); } return ret; @@ -266,7 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws, char *data_in; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; unsigned long bytes; @@ -289,7 +289,7 @@ static int lzo_decompress_biovec(struct list_head *ws, tot_in = LZO_LEN; in_offset = LZO_LEN; tot_len = min_t(size_t, srclen, tot_len); - in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + in_page_bytes_left = PAGE_SIZE - LZO_LEN; tot_out = 0; pg_offset = 0; @@ -345,12 +345,12 @@ cont: data_in = kmap(pages_in[++page_in_index]); - in_page_bytes_left = PAGE_CACHE_SIZE; + in_page_bytes_left = PAGE_SIZE; in_offset = 0; } } - out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + out_len = lzo1x_worst_compress(PAGE_SIZE); ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, &out_len); if (need_unmap) @@ -399,7 +399,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, in_len = read_compress_length(data_in); data_in += LZO_LEN; - out_len = PAGE_CACHE_SIZE; + out_len = PAGE_SIZE; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { printk(KERN_WARNING "BTRFS: decompress failed!\n"); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0de7da5a610d..559170464d7c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len) { - struct list_head splice, works; + LIST_HEAD(splice); + LIST_HEAD(skipped); + LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; int count = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); + const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); @@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) while (!list_empty(&splice) && nr) { ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); + + if (range_end <= ordered->start || + ordered->start + ordered->disk_len <= range_start) { + list_move_tail(&ordered->root_extent_list, &skipped); + cond_resched_lock(&root->ordered_extent_lock); + continue; + } + list_move_tail(&ordered->root_extent_list, &root->ordered_extents); atomic_inc(&ordered->refs); @@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) nr--; count++; } + list_splice_tail(&skipped, &root->ordered_extents); list_splice_tail(&splice, &root->ordered_extents); spin_unlock(&root->ordered_extent_lock); @@ -708,7 +718,8 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) return count; } -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; @@ -728,7 +739,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr); + done = btrfs_wait_ordered_extents(root, nr, + range_start, range_len); btrfs_put_fs_root(root); spin_lock(&fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 23c96059cef2..8ef12623d65c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len); +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct inode *inode, struct list_head *logged_list, const loff_t start, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5279fdae7142..9e119552ed32 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1463,6 +1463,7 @@ struct btrfs_qgroup_extent_record u64 bytenr = record->bytenr; assert_spin_locked(&delayed_refs->lock); + trace_btrfs_qgroup_insert_dirty_extent(record); while (*p) { parent_node = *p; @@ -1594,6 +1595,9 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + trace_qgroup_update_counters(qg->qgroupid, cur_old_count, + cur_new_count); + /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; @@ -1683,6 +1687,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, goto out_free; BUG_ON(!fs_info->quota_root); + trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots, + nr_new_roots); + qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { ret = -ENOMEM; @@ -1752,6 +1759,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record = rb_entry(node, struct btrfs_qgroup_extent_record, node); + trace_btrfs_qgroup_account_extents(record); + if (!ret) { /* * Use (u64)-1 as time_seq to do special search, which @@ -1842,8 +1851,10 @@ out: } /* - * copy the acounting information between qgroups. This is necessary when a - * snapshot or a subvolume is created + * Copy the acounting information between qgroups. This is necessary + * when a snapshot or a subvolume is created. Throwing an error will + * cause a transaction abort so we take extra care here to only error + * when a readonly fs is a reasonable outcome. */ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, @@ -1873,15 +1884,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, 2 * inherit->num_excl_copies; for (i = 0; i < nums; ++i) { srcgroup = find_qgroup_rb(fs_info, *i_qgroups); - if (!srcgroup) { - ret = -EINVAL; - goto out; - } - if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { - ret = -EINVAL; - goto out; - } + /* + * Zero out invalid groups so we can ignore + * them later. + */ + if (!srcgroup || + ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) + *i_qgroups = 0ULL; + ++i_qgroups; } } @@ -1916,17 +1927,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, */ if (inherit) { i_qgroups = (u64 *)(inherit + 1); - for (i = 0; i < inherit->num_qgroups; ++i) { + for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { + if (*i_qgroups == 0) + continue; ret = add_qgroup_relation_item(trans, quota_root, objectid, *i_qgroups); - if (ret) + if (ret && ret != -EEXIST) goto out; ret = add_qgroup_relation_item(trans, quota_root, *i_qgroups, objectid); - if (ret) + if (ret && ret != -EEXIST) goto out; - ++i_qgroups; } + ret = 0; } @@ -1987,17 +2000,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { - ret = add_relation_rb(quota_root->fs_info, objectid, - *i_qgroups); - if (ret) - goto unlock; + if (*i_qgroups) { + ret = add_relation_rb(quota_root->fs_info, objectid, + *i_qgroups); + if (ret) + goto unlock; + } ++i_qgroups; } - for (i = 0; i < inherit->num_ref_copies; ++i) { + for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); @@ -2008,12 +2026,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dst->rfer = src->rfer - level_size; dst->rfer_cmpr = src->rfer_cmpr - level_size; - i_qgroups += 2; } - for (i = 0; i < inherit->num_excl_copies; ++i) { + for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); @@ -2024,7 +2044,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dst->excl = src->excl + level_size; dst->excl_cmpr = src->excl_cmpr + level_size; - i_qgroups += 2; } unlock: diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 55161369fab1..0b7792e02dd5 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -270,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) s = kmap(rbio->bio_pages[i]); d = kmap(rbio->stripe_pages[i]); - memcpy(d, s, PAGE_CACHE_SIZE); + memcpy(d, s, PAGE_SIZE); kunmap(rbio->bio_pages[i]); kunmap(rbio->stripe_pages[i]); @@ -962,7 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, */ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { - return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes; + return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; } /* @@ -1078,7 +1078,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, u64 disk_start; stripe = &rbio->bbio->stripes[stripe_nr]; - disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); + disk_start = stripe->physical + (page_index << PAGE_SHIFT); /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) @@ -1096,8 +1096,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, if (last_end == disk_start && stripe->dev->bdev && !last->bi_error && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); - if (ret == PAGE_CACHE_SIZE) + ret = bio_add_page(last, page, PAGE_SIZE, 0); + if (ret == PAGE_SIZE) return 0; } } @@ -1111,7 +1111,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, bio->bi_bdev = stripe->dev->bdev; bio->bi_iter.bi_sector = disk_start >> 9; - bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + bio_add_page(bio, page, PAGE_SIZE, 0); bio_list_add(bio_list, bio); return 0; } @@ -1154,7 +1154,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) bio_list_for_each(bio, &rbio->bio_list) { start = (u64)bio->bi_iter.bi_sector << 9; stripe_offset = start - rbio->bbio->raid_map[0]; - page_index = stripe_offset >> PAGE_CACHE_SHIFT; + page_index = stripe_offset >> PAGE_SHIFT; for (i = 0; i < bio->bi_vcnt; i++) { p = bio->bi_io_vec[i].bv_page; @@ -1253,7 +1253,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } else { /* raid5 */ memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); - run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } @@ -1914,7 +1914,7 @@ pstripe: /* Copy parity block into failed block to start with */ memcpy(pointers[faila], pointers[rbio->nr_data], - PAGE_CACHE_SIZE); + PAGE_SIZE); /* rearrange the pointer array */ p = pointers[faila]; @@ -1923,7 +1923,7 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); + run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); } /* if we're doing this rebuild as part of an rmw, go through * and set all of our private rbio pages in the @@ -2250,7 +2250,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + rbio->stripe_len * rbio->nr_data); stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); - index = stripe_offset >> PAGE_CACHE_SHIFT; + index = stripe_offset >> PAGE_SHIFT; rbio->bio_pages[index] = page; } @@ -2365,14 +2365,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, } else { /* raid5 */ memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); - run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } /* Check scrubbing pairty and repair it */ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); parity = kmap(p); - if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE)) - memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE); + if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) + memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE); else /* Parity is right, needn't writeback */ bitmap_clear(rbio->dbitmap, pagenr, 1); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b892914968c1..298631eaee78 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -226,7 +226,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info, /* find extent */ spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); if (re) re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -257,7 +257,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, zone = NULL; spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_CACHE_SHIFT, 1); + logical >> PAGE_SHIFT, 1); if (ret == 1 && logical >= zone->start && logical <= zone->end) { kref_get(&zone->refcnt); spin_unlock(&fs_info->reada_lock); @@ -294,13 +294,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), + (unsigned long)(zone->end >> PAGE_SHIFT), zone); if (ret == -EEXIST) { kfree(zone); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_CACHE_SHIFT, 1); + logical >> PAGE_SHIFT, 1); if (ret == 1 && logical >= zone->start && logical <= zone->end) kref_get(&zone->refcnt); else @@ -326,7 +326,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, u64 length; int real_stripes; int nzones = 0; - unsigned long index = logical >> PAGE_CACHE_SHIFT; + unsigned long index = logical >> PAGE_SHIFT; int dev_replace_is_ongoing; int have_zone = 0; @@ -495,7 +495,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, struct reada_extent *re) { int i; - unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + unsigned long index = re->logical >> PAGE_SHIFT; spin_lock(&fs_info->reada_lock); if (--re->refcnt) { @@ -538,7 +538,7 @@ static void reada_zone_release(struct kref *kref) struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); radix_tree_delete(&zone->device->reada_zones, - zone->end >> PAGE_CACHE_SHIFT); + zone->end >> PAGE_SHIFT); kfree(zone); } @@ -587,7 +587,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) { int i; - unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + unsigned long index = zone->end >> PAGE_SHIFT; for (i = 0; i < zone->ndevs; ++i) { struct reada_zone *peer; @@ -622,7 +622,7 @@ static int reada_pick_zone(struct btrfs_device *dev) (void **)&zone, index, 1); if (ret == 0) break; - index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + index = (zone->end >> PAGE_SHIFT) + 1; if (zone->locked) { if (zone->elems > top_locked_elems) { top_locked_elems = zone->elems; @@ -673,7 +673,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, * plugging to speed things up */ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> PAGE_CACHE_SHIFT, 1); + dev->reada_next >> PAGE_SHIFT, 1); if (ret == 0 || re->logical > dev->reada_curr_zone->end) { ret = reada_pick_zone(dev); if (!ret) { @@ -682,7 +682,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } re = NULL; ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> PAGE_CACHE_SHIFT, 1); + dev->reada_next >> PAGE_SHIFT, 1); } if (ret == 0) { spin_unlock(&fs_info->reada_lock); @@ -838,7 +838,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) printk(KERN_CONT " curr off %llu", device->reada_next - zone->start); printk(KERN_CONT "\n"); - index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + index = (zone->end >> PAGE_SHIFT) + 1; } cnt = 0; index = 0; @@ -864,7 +864,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } } printk(KERN_CONT "\n"); - index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + index = (re->logical >> PAGE_SHIFT) + 1; if (++cnt > 15) break; } @@ -880,7 +880,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) if (ret == 0) break; if (!re->scheduled) { - index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + index = (re->logical >> PAGE_SHIFT) + 1; continue; } printk(KERN_DEBUG @@ -897,7 +897,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } } printk(KERN_CONT "\n"); - index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + index = (re->logical >> PAGE_SHIFT) + 1; } spin_unlock(&fs_info->reada_lock); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2bd0011450df..1cfd35cfac76 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1850,6 +1850,7 @@ again: eb = read_tree_block(dest, old_bytenr, old_ptr_gen); if (IS_ERR(eb)) { ret = PTR_ERR(eb); + break; } else if (!extent_buffer_uptodate(eb)) { ret = -EIO; free_extent_buffer(eb); @@ -2417,7 +2418,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -3129,10 +3130,10 @@ static int relocate_file_extent_cluster(struct inode *inode, if (ret) goto out; - index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; - last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + index = (cluster->start - offset) >> PAGE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_SHIFT; while (index <= last_index) { - ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE); if (ret) goto out; @@ -3145,7 +3146,7 @@ static int relocate_file_extent_cluster(struct inode *inode, mask); if (!page) { btrfs_delalloc_release_metadata(inode, - PAGE_CACHE_SIZE); + PAGE_SIZE); ret = -ENOMEM; goto out; } @@ -3162,16 +3163,16 @@ static int relocate_file_extent_cluster(struct inode *inode, lock_page(page); if (!PageUptodate(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); btrfs_delalloc_release_metadata(inode, - PAGE_CACHE_SIZE); + PAGE_SIZE); ret = -EIO; goto out; } } page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + page_end = page_start + PAGE_SIZE - 1; lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); @@ -3191,7 +3192,7 @@ static int relocate_file_extent_cluster(struct inode *inode, unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); unlock_page(page); - page_cache_release(page); + put_page(page); index++; balance_dirty_pages_ratelimited(inode->i_mapping); @@ -4253,12 +4254,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); - if (ret < 0) { - err = ret; - goto out; - } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_block_group_reservations(rc->block_group); + btrfs_wait_nocow_writers(rc->block_group); + btrfs_wait_ordered_roots(fs_info, -1, + rc->block_group->key.objectid, + rc->block_group->key.offset); while (1) { mutex_lock(&fs_info->cleaner_mutex); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9fcd6dfc3266..b2b14e7115f1 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -284,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; @@ -293,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 39dbdcbf4d13..fa35cdc46494 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -703,7 +703,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) if (IS_ERR(inode)) return PTR_ERR(inode); - index = offset >> PAGE_CACHE_SHIFT; + index = offset >> PAGE_SHIFT; page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { @@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bbio = bbio; recover->map_length = mapped_length; - BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); + BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); @@ -1636,7 +1636,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, if (spage->io_error) { void *mapped_buffer = kmap_atomic(spage->page); - memset(mapped_buffer, 0, PAGE_CACHE_SIZE); + memset(mapped_buffer, 0, PAGE_SIZE); flush_dcache_page(spage->page); kunmap_atomic(mapped_buffer); } @@ -2127,6 +2127,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio) if (bio->bi_error) sblock->no_io_error_seen = 0; + bio_put(bio); + btrfs_queue_work(fs_info->scrub_workers, &sblock->work); } @@ -2860,7 +2862,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nsectors = map->stripe_len / root->sectorsize; + nsectors = div_u64(map->stripe_len, root->sectorsize); bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, GFP_NOFS); @@ -3070,7 +3072,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int slot; u64 nstripes; struct extent_buffer *l; - struct btrfs_key key; u64 physical; u64 logical; u64 logic_end; @@ -3079,7 +3080,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int mirror_num; struct reada_control *reada1; struct reada_control *reada2; - struct btrfs_key key_start; + struct btrfs_key key; struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -3158,21 +3159,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ - key_start.objectid = logical; - key_start.type = BTRFS_EXTENT_ITEM_KEY; - key_start.offset = (u64)0; + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; - reada1 = btrfs_reada_add(root, &key_start, &key_end); + reada1 = btrfs_reada_add(root, &key, &key_end); - key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_start.type = BTRFS_EXTENT_CSUM_KEY; - key_start.offset = logical; + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + reada2 = btrfs_reada_add(csum_root, &key, &key_end); if (!IS_ERR(reada1)) btrfs_reada_wait(reada1); @@ -4294,8 +4295,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, goto out; } - while (len >= PAGE_CACHE_SIZE) { - index = offset >> PAGE_CACHE_SHIFT; + while (len >= PAGE_SIZE) { + index = offset >> PAGE_SHIFT; again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { @@ -4326,7 +4327,7 @@ again: */ if (page->mapping != inode->i_mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } if (!PageUptodate(page)) { @@ -4348,15 +4349,15 @@ again: ret = err; next_page: unlock_page(page); - page_cache_release(page); + put_page(page); if (ret) break; - offset += PAGE_CACHE_SIZE; - physical_for_dev_replace += PAGE_CACHE_SIZE; - nocow_ctx_logical += PAGE_CACHE_SIZE; - len -= PAGE_CACHE_SIZE; + offset += PAGE_SIZE; + physical_for_dev_replace += PAGE_SIZE; + nocow_ctx_logical += PAGE_SIZE; + len -= PAGE_SIZE; } ret = COPY_COMPLETE; out: @@ -4390,8 +4391,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; - ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); - if (ret != PAGE_CACHE_SIZE) { + ret = bio_add_page(bio, page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { leave_with_eio: bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 19b7bf4284ee..6a8c86074aa4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4449,9 +4449,9 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) struct page *page; char *addr; struct btrfs_key key; - pgoff_t index = offset >> PAGE_CACHE_SHIFT; + pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; - unsigned pg_offset = offset & ~PAGE_CACHE_MASK; + unsigned pg_offset = offset & ~PAGE_MASK; ssize_t ret = 0; key.objectid = sctx->cur_ino; @@ -4471,7 +4471,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) if (len == 0) goto out; - last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + last_index = (offset + len - 1) >> PAGE_SHIFT; /* initial readahead */ memset(&sctx->ra, 0, sizeof(struct file_ra_state)); @@ -4481,7 +4481,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, - PAGE_CACHE_SIZE - pg_offset); + PAGE_SIZE - pg_offset); page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { ret = -ENOMEM; @@ -4493,7 +4493,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) lock_page(page); if (!PageUptodate(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); ret = -EIO; break; } @@ -4503,7 +4503,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); kunmap(page); unlock_page(page); - page_cache_release(page); + put_page(page); index++; pg_offset = 0; len -= cur_len; @@ -4804,7 +4804,7 @@ static int clone_range(struct send_ctx *sctx, type = btrfs_file_extent_type(leaf, ei); if (type == BTRFS_FILE_EXTENT_INLINE) { ext_len = btrfs_file_extent_inline_len(leaf, slot, ei); - ext_len = PAGE_CACHE_ALIGN(ext_len); + ext_len = PAGE_ALIGN(ext_len); } else { ext_len = btrfs_file_extent_num_bytes(leaf, ei); } @@ -4886,7 +4886,7 @@ static int send_write_or_clone(struct send_ctx *sctx, * but there may be items after this page. Make * sure to send the whole thing */ - len = PAGE_CACHE_ALIGN(len); + len = PAGE_ALIGN(len); } else { len = btrfs_file_extent_num_bytes(path->nodes[0], ei); } @@ -5939,6 +5939,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; + unsigned alloc_size; int sort_clone_roots = 0; int index; @@ -5978,6 +5979,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + if (arg->clone_sources_count > + ULLONG_MAX / sizeof(*arg->clone_sources)) { + ret = -EINVAL; + goto out; + } + if (!access_ok(VERIFY_READ, arg->clone_sources, sizeof(*arg->clone_sources) * arg->clone_sources_count)) { @@ -6022,40 +6029,53 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = vmalloc(sctx->send_max_size); + sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } } - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; + sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + if (!sctx->read_buf) { + ret = -ENOMEM; + goto out; + } } sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - sctx->clone_roots = vzalloc(sizeof(struct clone_root) * - (arg->clone_sources_count + 1)); + alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); + + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; + sctx->clone_roots = vzalloc(alloc_size); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } } + alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + if (arg->clone_sources_count) { - clone_sources_tmp = vmalloc(arg->clone_sources_count * - sizeof(*arg->clone_sources)); + clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; + clone_sources_tmp = vmalloc(alloc_size); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, - arg->clone_sources_count * - sizeof(*arg->clone_sources)); + alloc_size); if (ret) { ret = -EFAULT; goto out; @@ -6089,7 +6109,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; } - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); clone_sources_tmp = NULL; } @@ -6207,15 +6227,15 @@ out: btrfs_root_dec_send_in_progress(sctx->parent_root); kfree(arg); - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); if (sctx) { if (sctx->send_filp) fput(sctx->send_filp); - vfree(sctx->clone_roots); - vfree(sctx->send_buf); - vfree(sctx->read_buf); + kvfree(sctx->clone_roots); + kvfree(sctx->send_buf); + kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index b976597b0721..e05619f241be 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -66,7 +66,7 @@ u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ \ if (token && token->kaddr && token->offset <= offset && \ token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ res = get_unaligned_le##bits(p + off); \ @@ -104,7 +104,7 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \ \ if (token && token->kaddr && token->offset <= offset && \ token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ put_unaligned_le##bits(val, p + off); \ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 00b8f37cc306..bf71071ab6f6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -97,15 +97,6 @@ const char *btrfs_decode_error(int errno) return errstr; } -static void save_error_info(struct btrfs_fs_info *fs_info) -{ - /* - * today we only save the error info into ram. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); -} - /* btrfs handle error by forcing the filesystem readonly */ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) { @@ -131,11 +122,11 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } /* - * __btrfs_std_error decodes expected errors from the caller and + * __btrfs_handle_fs_error decodes expected errors from the caller and * invokes the approciate error response. */ __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -170,8 +161,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, } #endif + /* + * Today we only save the error info to memory. Long term we'll + * also send it down to the disk + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + /* Don't go through full error handling during mount */ - save_error_info(fs_info); if (sb->s_flags & MS_BORN) btrfs_handle_error(fs_info); } @@ -252,7 +248,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, /* Wake up anybody who may be waiting on this transaction */ wake_up(&root->fs_info->transaction_wait); wake_up(&root->fs_info->transaction_blocked_wait); - __btrfs_std_error(root->fs_info, function, line, errno, NULL); + __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL); } /* * __btrfs_panic decodes unexpected, fatal errors from the caller, @@ -1160,7 +1156,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1488,10 +1484,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); } else { /* - * Since SELinux(the only one supports security_mnt_opts) does - * NOT support changing context during remount/mount same sb, - * This must be the same or part of the same security options, - * just free it. + * Since SELinux (the only one supporting security_mnt_opts) + * does NOT support changing context during remount/mount of + * the same sb, this must be the same or part of the same + * security options, just free it. */ security_free_mnt_opts(sec_opts); } @@ -1669,8 +1665,8 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, unsigned long old_opts) { /* - * We need cleanup all defragable inodes if the autodefragment is - * close or the fs is R/O. + * We need to cleanup all defragable inodes if the autodefragment is + * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || @@ -2051,9 +2047,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; u64 thresh = 0; + int mixed = 0; /* - * holding chunk_muext to avoid allocating new chunks, holding + * holding chunk_mutex to avoid allocating new chunks, holding * device_list_mutex to avoid the device being removed */ rcu_read_lock(); @@ -2076,8 +2073,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - total_free_meta += found->disk_total - found->disk_used; + + /* + * Metadata in mixed block goup profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + mixed = 1; + else + total_free_meta += found->disk_total - + found->disk_used; + } total_used += found->disk_used; } @@ -2090,7 +2096,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* Account global block reserve as used, it's in logical size already */ spin_lock(&block_rsv->lock); - buf->f_bfree -= block_rsv->size >> bits; + /* Mixed block groups accounting is not byte-accurate, avoid overflow */ + if (buf->f_bfree >= block_rsv->size >> bits) + buf->f_bfree -= block_rsv->size >> bits; + else + buf->f_bfree = 0; spin_unlock(&block_rsv->lock); buf->f_bavail = div_u64(total_free_data, factor); @@ -2115,7 +2125,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = 4 * 1024 * 1024; - if (total_free_meta - thresh < block_rsv->size) + if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 539e7b5e3f86..4879656bda3c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, if (!fs_info) return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + ret = kstrtoul(skip_spaces(buf), 0, &val); if (ret) return ret; @@ -364,7 +367,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); char *label = fs_info->super_copy->label; - return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ssize_t ret; + + spin_lock(&fs_info->super_lock); + ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + spin_unlock(&fs_info->super_lock); + + return ret; } static ssize_t btrfs_label_store(struct kobject *kobj, @@ -374,6 +383,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); size_t p_len; + if (!fs_info) + return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 669b58201e36..70948b13bc81 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -32,8 +32,8 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, { int ret; struct page *pages[16]; - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; int count = 0; @@ -49,9 +49,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, count++; if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); if (flags & PROCESS_RELEASE) - page_cache_release(pages[i]); + put_page(pages[i]); } nr_pages -= ret; index += ret; @@ -93,7 +93,7 @@ static int test_find_delalloc(void) * everything to make sure our pages don't get evicted and screw up our * test. */ - for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { + for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { test_msg("Failed to allocate test page\n"); @@ -104,7 +104,7 @@ static int test_find_delalloc(void) if (index) { unlock_page(page); } else { - page_cache_get(page); + get_page(page); locked_page = page; } } @@ -129,7 +129,7 @@ static int test_find_delalloc(void) } unlock_extent(&tmp, start, end); unlock_page(locked_page); - page_cache_release(locked_page); + put_page(locked_page); /* * Test this scenario @@ -139,7 +139,7 @@ static int test_find_delalloc(void) */ test_start = SZ_64M; locked_page = find_lock_page(inode->i_mapping, - test_start >> PAGE_CACHE_SHIFT); + test_start >> PAGE_SHIFT); if (!locked_page) { test_msg("Couldn't find the locked page\n"); goto out_bits; @@ -165,7 +165,7 @@ static int test_find_delalloc(void) } unlock_extent(&tmp, start, end); /* locked_page was unlocked above */ - page_cache_release(locked_page); + put_page(locked_page); /* * Test this scenario @@ -174,7 +174,7 @@ static int test_find_delalloc(void) */ test_start = max_bytes + 4096; locked_page = find_lock_page(inode->i_mapping, test_start >> - PAGE_CACHE_SHIFT); + PAGE_SHIFT); if (!locked_page) { test_msg("Could'nt find the locked page\n"); goto out_bits; @@ -225,13 +225,13 @@ static int test_find_delalloc(void) * range we want to find. */ page = find_get_page(inode->i_mapping, - (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT); + (max_bytes + SZ_1M) >> PAGE_SHIFT); if (!page) { test_msg("Couldn't find our page\n"); goto out_bits; } ClearPageDirty(page); - page_cache_release(page); + put_page(page); /* We unlocked it in the previous test */ lock_page(locked_page); @@ -239,7 +239,7 @@ static int test_find_delalloc(void) end = 0; /* * Currently if we fail to find dirty pages in the delalloc range we - * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If + * will adjust max_bytes down to PAGE_SIZE and then re-search. If * this changes at any point in the future we will need to fix this * tests expected behavior. */ @@ -249,9 +249,9 @@ static int test_find_delalloc(void) test_msg("Didn't find our range\n"); goto out_bits; } - if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) { + if (start != test_start && end != test_start + PAGE_SIZE - 1) { test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", - test_start, test_start + PAGE_CACHE_SIZE - 1, start, + test_start, test_start + PAGE_SIZE - 1, start, end); goto out_bits; } @@ -265,7 +265,7 @@ out_bits: clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); out: if (locked_page) - page_cache_release(locked_page); + put_page(locked_page); process_page_range(inode, 0, total_dirty - 1, PROCESS_UNLOCK | PROCESS_RELEASE); iput(inode); @@ -298,9 +298,9 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return -EINVAL; } - bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + bitmap_set(bitmap, (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { test_msg("Setting straddling pages failed\n"); @@ -309,10 +309,10 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, bitmap_set(bitmap, 0, len * BITS_PER_BYTE); bitmap_clear(bitmap, - (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, sizeof(long) * BITS_PER_BYTE); extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { test_msg("Clearing straddling pages failed\n"); @@ -353,7 +353,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, static int test_eb_bitmaps(void) { - unsigned long len = PAGE_CACHE_SIZE * 4; + unsigned long len = PAGE_SIZE * 4; unsigned long *bitmap; struct extent_buffer *eb; int ret; @@ -379,7 +379,7 @@ static int test_eb_bitmaps(void) /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len); + eb = __alloc_dummy_extent_buffer(NULL, PAGE_SIZE / 2, len); if (!eb) { test_msg("Couldn't allocate test extent buffer\n"); kfree(bitmap); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index c9ad97b1e690..514247515312 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -22,7 +22,7 @@ #include "../disk-io.h" #include "../free-space-cache.h" -#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8) /* * This test just does basic sanity checking, making sure we can add an exten diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43885e51b882..5b0b758a3f79 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -311,10 +311,11 @@ loop: * when the transaction commits */ static int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + int force) { - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && - root->last_trans < trans->transid) { + if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) || force) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); @@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, smp_wmb(); spin_lock(&root->fs_info->fs_roots_radix_lock); - if (root->last_trans == trans->transid) { + if (root->last_trans == trans->transid && !force) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } @@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; mutex_lock(&root->fs_info->reloc_mutex); - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); mutex_unlock(&root->fs_info->reloc_mutex); return 0; @@ -1310,6 +1311,97 @@ int btrfs_defrag_root(struct btrfs_root *root) return ret; } +/* Bisesctability fixup, remove in 4.8 */ +#ifndef btrfs_std_error +#define btrfs_std_error btrfs_handle_fs_error +#endif + +/* + * Do all special snapshot related qgroup dirty hack. + * + * Will do all needed qgroup inherit and dirty hack like switch commit + * roots inside one transaction and write all btree into disk, to make + * qgroup works. + */ +static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_root *src, + struct btrfs_root *parent, + struct btrfs_qgroup_inherit *inherit, + u64 dst_objectid) +{ + struct btrfs_fs_info *fs_info = src->fs_info; + int ret; + + /* + * Save some performance in the case that qgroups are not + * enabled. If this check races with the ioctl, rescan will + * kick in anyway. + */ + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_enabled) { + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return 0; + } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + /* + * We are going to commit transaction, see btrfs_commit_transaction() + * comment for reason locking tree_log_mutex + */ + mutex_lock(&fs_info->tree_log_mutex); + + ret = commit_fs_roots(trans, src); + if (ret) + goto out; + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret < 0) + goto out; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret < 0) + goto out; + + /* Now qgroup are all updated, we can inherit it to new qgroups */ + ret = btrfs_qgroup_inherit(trans, fs_info, + src->root_key.objectid, dst_objectid, + inherit); + if (ret < 0) + goto out; + + /* + * Now we do a simplified commit transaction, which will: + * 1) commit all subvolume and extent tree + * To ensure all subvolume and extent tree have a valid + * commit_root to accounting later insert_dir_item() + * 2) write all btree blocks onto disk + * This is to make sure later btree modification will be cowed + * Or commit_root can be populated and cause wrong qgroup numbers + * In this simplified commit, we don't really care about other trees + * like chunk and root tree, as they won't affect qgroup. + * And we don't write super to avoid half committed status. + */ + ret = commit_cowonly_roots(trans, src); + if (ret) + goto out; + switch_commit_roots(trans->transaction, fs_info); + ret = btrfs_write_and_wait_transaction(trans, src); + if (ret) + btrfs_std_error(fs_info, ret, + "Error while writing out transaction for qgroup"); + +out: + mutex_unlock(&fs_info->tree_log_mutex); + + /* + * Force parent root to be updated, as we recorded it before so its + * last_trans == cur_transid. + * Or it won't be committed again onto disk after later + * insert_dir_item() + */ + if (!ret) + record_root_in_trans(trans, parent, 1); + return ret; +} + /* * new snapshots need to be created at a very specific time in the * transaction commit. This does the actual creation. @@ -1383,7 +1475,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry = pending->dentry; parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root, 0); cur_time = current_fs_time(parent_inode->i_sb); @@ -1420,7 +1512,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1516,6 +1608,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } + /* + * Do special qgroup accounting for snapshot, as we do some qgroup + * snapshot hack to do fast snapshot. + * To co-operate with that hack, we do hack again. + * Or snapshot will be greatly slowed down by a subtree qgroup rescan + */ + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + if (ret < 0) + goto fail; + ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, @@ -1559,23 +1662,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - /* - * account qgroup counters before qgroup_inherit() - */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - fail: pending->error = ret; dir_item_existed: @@ -1821,7 +1907,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } static inline void @@ -2145,7 +2231,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 24d03c751149..8aaca5c6af94 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); + down_write(&BTRFS_I(inode)->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); + btrfs_get_logged_extents(inode, logged_list, start, end); /* - * Collect any new ordered extents within the range. This is to - * prevent logging file extent items without waiting for the disk - * location they point to being written. We do this only to deal - * with races against concurrent lockless direct IO writes. + * Some ordered extents started by fsync might have completed + * before we could collect them into the list logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. */ - btrfs_get_logged_extents(inode, logged_list, start, end); + ret = btrfs_inode_check_errors(inode); + if (ret) + ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4202,6 +4210,7 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); + up_write(&BTRFS_I(inode)->dio_sem); btrfs_release_path(path); return ret; @@ -4415,6 +4424,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, return ret; } +/* + * When we are logging a new inode X, check if it doesn't have a reference that + * matches the reference from some other inode Y created in a past transaction + * and that was renamed in the current transaction. If we don't do this, then at + * log replay time we can lose inode Y (and all its files if it's a directory): + * + * mkdir /mnt/x + * echo "hello world" > /mnt/x/foobar + * sync + * mv /mnt/x /mnt/y + * mkdir /mnt/x # or touch /mnt/x + * xfs_io -c fsync /mnt/x + * <power fail> + * mount fs, trigger log replay + * + * After the log replay procedure, we would lose the first directory and all its + * files (file foobar). + * For the case where inode Y is not a directory we simply end up losing it: + * + * echo "123" > /mnt/foo + * sync + * mv /mnt/foo /mnt/bar + * echo "abc" > /mnt/foo + * xfs_io -c fsync /mnt/foo + * <power fail> + * + * We also need this for cases where a snapshot entry is replaced by some other + * entry (file or directory) otherwise we end up with an unreplayable log due to + * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as + * if it were a regular entry: + * + * mkdir /mnt/x + * btrfs subvolume snapshot /mnt /mnt/x/snap + * btrfs subvolume delete /mnt/x/snap + * rmdir /mnt/x + * mkdir /mnt/x + * fsync /mnt/x or fsync some new file inside it + * <power fail> + * + * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in + * the same transaction. + */ +static int btrfs_check_ref_name_override(struct extent_buffer *eb, + const int slot, + const struct btrfs_key *key, + struct inode *inode) +{ + int ret; + struct btrfs_path *search_path; + char *name = NULL; + u32 name_len = 0; + u32 item_size = btrfs_item_size_nr(eb, slot); + u32 cur_offset = 0; + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); + + search_path = btrfs_alloc_path(); + if (!search_path) + return -ENOMEM; + search_path->search_commit_root = 1; + search_path->skip_locking = 1; + + while (cur_offset < item_size) { + u64 parent; + u32 this_name_len; + u32 this_len; + unsigned long name_ptr; + struct btrfs_dir_item *di; + + if (key->type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + + iref = (struct btrfs_inode_ref *)(ptr + cur_offset); + parent = key->offset; + this_name_len = btrfs_inode_ref_name_len(eb, iref); + name_ptr = (unsigned long)(iref + 1); + this_len = sizeof(*iref) + this_name_len; + } else { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + this_name_len = btrfs_inode_extref_name_len(eb, extref); + name_ptr = (unsigned long)&extref->name; + this_len = sizeof(*extref) + this_name_len; + } + + if (this_name_len > name_len) { + char *new_name; + + new_name = krealloc(name, this_name_len, GFP_NOFS); + if (!new_name) { + ret = -ENOMEM; + goto out; + } + name_len = this_name_len; + name = new_name; + } + + read_extent_buffer(eb, name, name_ptr, this_name_len); + di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root, + search_path, parent, + name, this_name_len, 0); + if (di && !IS_ERR(di)) { + ret = 1; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_release_path(search_path); + + cur_offset += this_len; + } + ret = 0; +out: + btrfs_free_path(search_path); + kfree(name); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4502,23 +4632,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); /* - * Collect ordered extents only if we are logging data. This is to - * ensure a subsequent request to log this inode in LOG_INODE_ALL mode - * will process the ordered extents if they still exists at the time, - * because when we collect them we test and set for the flag - * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the - * same ordered extents. The consequence for the LOG_INODE_ALL log mode - * not processing the ordered extents is that we end up logging the - * corresponding file extent items, based on the extent maps in the - * inode's extent_map_tree's modified_list, without logging the - * respective checksums (since the may still be only attached to the - * ordered extents and have not been inserted in the csum tree by - * btrfs_finish_ordered_io() yet). - */ - if (inode_only == LOG_INODE_ALL) - btrfs_get_logged_extents(inode, &logged_list, start, end); - - /* * a brute force approach to making sure we get the most uptodate * copies of everything. */ @@ -4602,6 +4715,22 @@ again: if (min_key.type == BTRFS_INODE_ITEM_KEY) need_log_inode_item = false; + if ((min_key.type == BTRFS_INODE_REF_KEY || + min_key.type == BTRFS_INODE_EXTREF_KEY) && + BTRFS_I(inode)->generation == trans->transid) { + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], + &min_key, inode); + if (ret < 0) { + err = ret; + goto out_unlock; + } else if (ret > 0) { + err = 1; + btrfs_set_log_full_commit(root->fs_info, trans); + goto out_unlock; + } + } + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ if (min_key.type == BTRFS_XATTR_ITEM_KEY) { if (ins_nr == 0) @@ -4709,21 +4838,6 @@ log_extents: goto out_unlock; } if (fast_search) { - /* - * Some ordered extents started by fsync might have completed - * before we collected the ordered extents in logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. - */ - err = btrfs_inode_check_errors(inode); - if (err) { - ctx->io_err = err; - goto out_unlock; - } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx, start, end); if (ret) { @@ -4851,7 +4965,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, goto out; if (!S_ISDIR(inode->i_mode)) { - if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) goto out; inode = d_inode(parent); } @@ -4872,7 +4986,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, break; } - if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; if (IS_ROOT(parent)) @@ -5021,7 +5135,7 @@ process_leaf: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR) + if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, @@ -5141,11 +5255,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (IS_ERR(dir_inode)) continue; + if (ctx) + ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); if (!ret && btrfs_must_commit_transaction(trans, dir_inode)) ret = 1; + if (!ret && ctx && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, root, + dir_inode, ctx); iput(dir_inode); if (ret) goto out; @@ -5285,7 +5404,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, } while (1) { - if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; inode = d_inode(parent); @@ -5382,7 +5501,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_std_error(fs_info, ret, "Failed to pin buffers while " + btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5396,7 +5515,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5414,7 +5533,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5429,7 +5548,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_std_error(fs_info, ret, "Couldn't read target root " + btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } @@ -5515,11 +5634,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - if (S_ISREG(inode->i_mode)) { - mutex_lock(&BTRFS_I(inode)->log_mutex); - BTRFS_I(inode)->last_unlink_trans = trans->transid; - mutex_unlock(&BTRFS_I(inode)->log_mutex); - } + mutex_lock(&BTRFS_I(inode)->log_mutex); + BTRFS_I(inode)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); /* * if this directory was already logged any new diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e2b54d546b7c..2b88127bba5b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,13 +20,13 @@ #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> -#include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/semaphore.h> +#include <linux/uuid.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -118,6 +118,21 @@ const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, }; +/* + * Table to convert BTRFS_RAID_* to the error code if minimum number of devices + * condition is not met. Zero means there's no corresponding + * BTRFS_ERROR_DEV_*_NOT_MET value. + */ +const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, + [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + [BTRFS_RAID_DUP] = 0, + [BTRFS_RAID_RAID0] = 0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, + [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -699,7 +714,8 @@ static noinline int device_list_add(const char *path, * if there is new btrfs on an already registered device, * then remove the stale device entry. */ - btrfs_free_stale_device(device); + if (ret > 0) + btrfs_free_stale_device(device); *fs_devices_ret = fs_devices; @@ -988,6 +1004,56 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } +void btrfs_release_disk_super(struct page *page) +{ + kunmap(page); + put_page(page); +} + +int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, struct btrfs_super_block **disk_super) +{ + void *p; + pgoff_t index; + + /* make sure our super fits in the device */ + if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + return 1; + + /* make sure our super fits in the page */ + if (sizeof(**disk_super) > PAGE_SIZE) + return 1; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_SHIFT; + if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) + return 1; + + /* pull in the page with our super */ + *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_KERNEL); + + if (IS_ERR_OR_NULL(*page)) + return 1; + + p = kmap(*page); + + /* align our pointer to the offset of the super block */ + *disk_super = p + (bytenr & ~PAGE_MASK); + + if (btrfs_super_bytenr(*disk_super) != bytenr || + btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { + btrfs_release_disk_super(*page); + return 1; + } + + if ((*disk_super)->label[0] && + (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) + (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock @@ -999,13 +1065,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_super_block *disk_super; struct block_device *bdev; struct page *page; - void *p; int ret = -EINVAL; u64 devid; u64 transid; u64 total_devices; u64 bytenr; - pgoff_t index; /* * we would like to check all the supers, but that would make @@ -1018,41 +1082,14 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); bdev = blkdev_get_by_path(path, flags, holder); - if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto error; } - /* make sure our super fits in the device */ - if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) + if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) goto error_bdev_put; - /* make sure our super fits in the page */ - if (sizeof(*disk_super) > PAGE_CACHE_SIZE) - goto error_bdev_put; - - /* make sure our super doesn't straddle pages on disk */ - index = bytenr >> PAGE_CACHE_SHIFT; - if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) - goto error_bdev_put; - - /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_inode->i_mapping, - index, GFP_NOFS); - - if (IS_ERR_OR_NULL(page)) - goto error_bdev_put; - - p = kmap(page); - - /* align our pointer to the offset of the super block */ - disk_super = p + (bytenr & ~PAGE_CACHE_MASK); - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) - goto error_unmap; - devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); @@ -1060,8 +1097,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (ret > 0) { if (disk_super->label[0]) { - if (disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); } else { printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); @@ -1073,9 +1108,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; -error_unmap: - kunmap(page); - page_cache_release(page); + btrfs_release_disk_super(page); error_bdev_put: blkdev_put(bdev, flags); @@ -1454,7 +1487,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_std_error(root->fs_info, ret, "Slot search failed"); + btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1462,7 +1495,7 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); @@ -1688,32 +1721,92 @@ out: return ret; } -int btrfs_rm_device(struct btrfs_root *root, char *device_path) +/* + * Verify that @num_devices satisfies the RAID profile constraints in the whole + * filesystem. It's up to the caller to adjust that number regarding eg. device + * replace. + */ +static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, + u64 num_devices) +{ + u64 all_avail; + unsigned seq; + int i; + + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + all_avail = fs_info->avail_data_alloc_bits | + fs_info->avail_system_alloc_bits | + fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!(all_avail & btrfs_raid_group[i])) + continue; + + if (num_devices < btrfs_raid_array[i].devs_min) { + int ret = btrfs_raid_mindev_error[i]; + + if (ret) + return ret; + } + } + + return 0; +} + +struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, + struct btrfs_device *device) { - struct btrfs_device *device; struct btrfs_device *next_device; - struct block_device *bdev; - struct buffer_head *bh = NULL; - struct btrfs_super_block *disk_super; + + list_for_each_entry(next_device, &fs_devs->devices, dev_list) { + if (next_device != device && + !next_device->missing && next_device->bdev) + return next_device; + } + + return NULL; +} + +/* + * Helper function to check if the given device is part of s_bdev / latest_bdev + * and replace it with the provided or the next active device, in the context + * where this function called, there should be always be another device (or + * this_dev) which is active. + */ +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev) +{ + struct btrfs_device *next_device; + + if (this_dev) + next_device = this_dev; + else + next_device = btrfs_find_next_active_device(fs_info->fs_devices, + device); + ASSERT(next_device); + + if (fs_info->sb->s_bdev && + (fs_info->sb->s_bdev == device->bdev)) + fs_info->sb->s_bdev = next_device->bdev; + + if (fs_info->fs_devices->latest_bdev == device->bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) +{ + struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; - u64 all_avail; - u64 devid; u64 num_devices; - u8 *dev_uuid; - unsigned seq; int ret = 0; bool clear_super = false; + char *dev_name = NULL; mutex_lock(&uuid_mutex); - do { - seq = read_seqbegin(&root->fs_info->profiles_lock); - - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; - } while (read_seqretry(&root->fs_info->profiles_lock, seq)); - num_devices = root->fs_info->fs_devices->num_devices; btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { @@ -1722,78 +1815,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { - ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; - goto out; - } - - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; + ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1); + if (ret) goto out; - } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && - root->fs_info->fs_devices->rw_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; - goto out; - } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && - root->fs_info->fs_devices->rw_devices <= 3) { - ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; + ret = btrfs_find_device_by_devspec(root, devid, device_path, + &device); + if (ret) goto out; - } - - if (strcmp(device_path, "missing") == 0) { - struct list_head *devices; - struct btrfs_device *tmp; - - device = NULL; - devices = &root->fs_info->fs_devices->devices; - /* - * It is safe to read the devices since the volume_mutex - * is held. - */ - list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && - !tmp->is_tgtdev_for_dev_replace && - !tmp->bdev) { - device = tmp; - break; - } - } - bdev = NULL; - bh = NULL; - disk_super = NULL; - if (!device) { - ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - goto out; - } - } else { - ret = btrfs_get_bdev_and_sb(device_path, - FMODE_WRITE | FMODE_EXCL, - root->fs_info->bdev_holder, 0, - &bdev, &bh); - if (ret) - goto out; - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root->fs_info, devid, dev_uuid, - disk_super->fsid); - if (!device) { - ret = -ENOENT; - goto error_brelse; - } - } if (device->is_tgtdev_for_dev_replace) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto error_brelse; + goto out; } if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto error_brelse; + goto out; } if (device->writeable) { @@ -1801,6 +1839,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; unlock_chunks(root); + dev_name = kstrdup(device->name->str, GFP_KERNEL); + if (!dev_name) { + ret = -ENOMEM; + goto error_undo; + } clear_super = true; } @@ -1842,12 +1885,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->missing) device->fs_devices->missing_devices--; - next_device = list_entry(root->fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (device->bdev == root->fs_info->sb->s_bdev) - root->fs_info->sb->s_bdev = next_device->bdev; - if (device->bdev == root->fs_info->fs_devices->latest_bdev) - root->fs_info->fs_devices->latest_bdev = next_device->bdev; + btrfs_assign_next_active_device(root->fs_info, device, NULL); if (device->bdev) { device->fs_devices->open_devices--; @@ -1883,63 +1921,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * at this point, the device is zero sized. We want to * remove it from the devices list and zero out the old super */ - if (clear_super && disk_super) { - u64 bytenr; - int i; - - /* make sure this device isn't detected as part of - * the FS anymore - */ - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - - /* clear the mirror copies of super block on the disk - * being removed, 0th copy is been taken care above and - * the below would take of the rest - */ - for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - - brelse(bh); - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) - continue; - - disk_super = (struct btrfs_super_block *)bh->b_data; - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - continue; - } - memset(&disk_super->magic, 0, - sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); + if (clear_super) { + struct block_device *bdev; + + bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); + if (!IS_ERR(bdev)) { + btrfs_scratch_superblocks(bdev, dev_name); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); } } - ret = 0; - - if (bdev) { - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); - } - -error_brelse: - brelse(bh); - if (bdev) - blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: + kfree(dev_name); + mutex_unlock(&uuid_mutex); return ret; + error_undo: if (device->writeable) { lock_chunks(root); @@ -1948,7 +1946,7 @@ error_undo: device->fs_devices->rw_devices++; unlock_chunks(root); } - goto error_brelse; + goto out; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, @@ -1972,11 +1970,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->missing) fs_devices->missing_devices--; - if (srcdev->writeable) { + if (srcdev->writeable) fs_devices->rw_devices--; - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); - } if (srcdev->bdev) fs_devices->open_devices--; @@ -1987,6 +1982,10 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + if (srcdev->writeable) { + /* zero out the old super if it is writable */ + btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); + } call_rcu(&srcdev->rcu, free_device); /* @@ -2016,32 +2015,33 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev) { - struct btrfs_device *next_device; - mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); - if (tgtdev->bdev) { - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + if (tgtdev->bdev) fs_info->fs_devices->open_devices--; - } + fs_info->fs_devices->num_devices--; - next_device = list_entry(fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (tgtdev->bdev == fs_info->sb->s_bdev) - fs_info->sb->s_bdev = next_device->bdev; - if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) - fs_info->fs_devices->latest_bdev = next_device->bdev; - list_del_rcu(&tgtdev->dev_list); + btrfs_assign_next_active_device(fs_info, tgtdev, NULL); - call_rcu(&tgtdev->rcu, free_device); + list_del_rcu(&tgtdev->dev_list); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + + /* + * The update_dev_time() with in btrfs_scratch_superblocks() + * may lead to a call to btrfs_show_devname() which will try + * to hold device_list_mutex. And here this device + * is already out of device list, so we don't have to hold + * the device_list_mutex lock. + */ + btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + call_rcu(&tgtdev->rcu, free_device); } static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, @@ -2102,6 +2102,31 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } /* + * Lookup a device given by device id, or the path if the id is 0. + */ +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, + struct btrfs_device **device) +{ + int ret; + + if (devid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, devid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + if (!devpath || !devpath[0]) + return -EINVAL; + + ret = btrfs_find_device_missing_or_by_path(root, devpath, + device); + } + return ret; +} + +/* * does all the dirty work required for changing file system's UUID. */ static int btrfs_prepare_sprout(struct btrfs_root *root) @@ -2418,7 +2443,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); @@ -2663,7 +2688,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_std_error(root->fs_info, -ENOENT, + btrfs_handle_fs_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2671,7 +2696,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2857,7 +2882,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) chunk_offset); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } @@ -3402,6 +3427,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; + u64 bytes_used = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3540,7 +3566,13 @@ again: goto loop; } - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) { + ASSERT(fs_info->data_sinfo); + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + !chunk_reserved && !bytes_used) { trans = btrfs_start_transaction(chunk_root, 0); if (IS_ERR(trans)) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); @@ -3632,7 +3664,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret, NULL); + btrfs_handle_fs_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } @@ -3693,10 +3725,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, num_devices--; } btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (num_devices == 1) - allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices > 1) + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; + if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); if (num_devices > 2) allowed |= BTRFS_BLOCK_GROUP_RAID5; @@ -5278,7 +5308,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; - BUG_ON(offset < stripe_offset); + if (offset < stripe_offset) { + btrfs_crit(fs_info, "stripe math has gone wrong, " + "stripe_offset=%llu, offset=%llu, start=%llu, " + "logical=%llu, stripe_len=%llu", + stripe_offset, offset, em->start, logical, + stripe_len); + free_extent_map(em); + return -EINVAL; + } /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; @@ -5519,7 +5557,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, &stripe_index); mirror_num = stripe_index + 1; } - BUG_ON(stripe_index >= map->num_stripes); + if (stripe_index >= map->num_stripes) { + btrfs_crit(fs_info, "stripe index math went horribly wrong, " + "got stripe_index=%u, num_stripes=%u", + stripe_index, map->num_stripes); + ret = -EINVAL; + goto out; + } num_alloc_stripes = num_stripes; if (dev_replace_is_ongoing) { @@ -6242,7 +6286,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, "invalid chunk length %llu", length); return -EIO; } - if (!is_power_of_2(stripe_len)) { + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", stripe_len); return -EIO; @@ -6527,7 +6571,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) * but sb spans only this function. Add an explicit SetPageUptodate call * to silence the warning eg. on PowerPC 64. */ - if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) + if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1939ebde63df..0ac90f8d85bd 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -340,14 +340,14 @@ struct btrfs_raid_attr { }; extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; - +extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES]; extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; struct map_lookup { u64 type; int io_align; int io_width; - int stripe_len; + u64 stripe_len; int sector_size; int num_stripes; int sub_stripes; @@ -357,52 +357,6 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -/* - * Restriper's general type filter - */ -#define BTRFS_BALANCE_DATA (1ULL << 0) -#define BTRFS_BALANCE_SYSTEM (1ULL << 1) -#define BTRFS_BALANCE_METADATA (1ULL << 2) - -#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ - BTRFS_BALANCE_SYSTEM | \ - BTRFS_BALANCE_METADATA) - -#define BTRFS_BALANCE_FORCE (1ULL << 3) -#define BTRFS_BALANCE_RESUME (1ULL << 4) - -/* - * Balance filters - */ -#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) -#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) -#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) -#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) -#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) -#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) -#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) -#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) -#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10) - -#define BTRFS_BALANCE_ARGS_MASK \ - (BTRFS_BALANCE_ARGS_PROFILES | \ - BTRFS_BALANCE_ARGS_USAGE | \ - BTRFS_BALANCE_ARGS_DEVID | \ - BTRFS_BALANCE_ARGS_DRANGE | \ - BTRFS_BALANCE_ARGS_VRANGE | \ - BTRFS_BALANCE_ARGS_LIMIT | \ - BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ - BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ - BTRFS_BALANCE_ARGS_USAGE_RANGE) - -/* - * Profile changing flags. When SOFT is set we won't relocate chunk if - * it already has the target profile (even though it may be - * half-filled). - */ -#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) -#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) - struct btrfs_balance_args; struct btrfs_balance_progress; struct btrfs_balance_control { @@ -445,13 +399,18 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, + struct btrfs_device **device); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); -int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 145d2b89e62d..3bfb252206c7 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -237,6 +237,9 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + if (trans) return do_setxattr(trans, inode, name, value, size, flags); @@ -369,11 +372,9 @@ err: } static int btrfs_xattr_handler_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __btrfs_getxattr(inode, name, buffer, size); } @@ -434,25 +435,6 @@ const struct xattr_handler *btrfs_xattr_handlers[] = { NULL, }; -int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - - if (btrfs_root_readonly(root)) - return -EROFS; - return generic_setxattr(dentry, name, value, size, flags); -} - -int btrfs_removexattr(struct dentry *dentry, const char *name) -{ - struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - - if (btrfs_root_readonly(root)) - return -EROFS; - return generic_removexattr(dentry, name); -} - static int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 96807b3d22f5..15fc4743dc70 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -28,9 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); -extern int btrfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); -extern int btrfs_removexattr(struct dentry *dentry, const char *name); extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 82990b8f872b..88d274e8ecf2 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -59,7 +59,7 @@ static struct list_head *zlib_alloc_workspace(void) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); workspace->strm.workspace = vmalloc(workspacesize); - workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS); if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -103,7 +103,7 @@ static int zlib_compress_pages(struct list_head *ws, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap(in_page); out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); @@ -117,8 +117,8 @@ static int zlib_compress_pages(struct list_head *ws, workspace->strm.next_in = data_in; workspace->strm.next_out = cpage_out; - workspace->strm.avail_out = PAGE_CACHE_SIZE; - workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE); + workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_in = min(len, PAGE_SIZE); while (workspace->strm.total_in < len) { ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); @@ -156,7 +156,7 @@ static int zlib_compress_pages(struct list_head *ws, cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; workspace->strm.next_out = cpage_out; } /* we're all done */ @@ -170,14 +170,14 @@ static int zlib_compress_pages(struct list_head *ws, bytes_left = len - workspace->strm.total_in; kunmap(in_page); - page_cache_release(in_page); + put_page(in_page); - start += PAGE_CACHE_SIZE; + start += PAGE_SIZE; in_page = find_get_page(mapping, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); data_in = kmap(in_page); workspace->strm.avail_in = min(bytes_left, - PAGE_CACHE_SIZE); + PAGE_SIZE); workspace->strm.next_in = data_in; } } @@ -205,7 +205,7 @@ out: if (in_page) { kunmap(in_page); - page_cache_release(in_page); + put_page(in_page); } return ret; } @@ -223,18 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, size_t total_out = 0; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long pg_offset; data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; - workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; workspace->strm.total_out = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then @@ -274,7 +274,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; if (workspace->strm.avail_in == 0) { unsigned long tmp; @@ -288,7 +288,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, - PAGE_CACHE_SIZE); + PAGE_SIZE); } } if (ret != Z_STREAM_END) @@ -325,7 +325,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, workspace->strm.total_in = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; workspace->strm.total_out = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -368,8 +368,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, else buf_offset = 0; - bytes = min(PAGE_CACHE_SIZE - pg_offset, - PAGE_CACHE_SIZE - buf_offset); + bytes = min(PAGE_SIZE - pg_offset, + PAGE_SIZE - buf_offset); bytes = min(bytes, bytes_left); kaddr = kmap_atomic(dest_page); @@ -380,7 +380,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, bytes_left -= bytes; next: workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; } if (ret != Z_STREAM_END && bytes_left != 0) diff --git a/fs/buffer.c b/fs/buffer.c index 33be29675358..754813a6962b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -129,7 +129,7 @@ __clear_page_buffers(struct page *page) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } static void buffer_io_error(struct buffer_head *bh, char *msg) @@ -207,7 +207,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) struct page *page; int all_mapped = 1; - index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); + index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); if (!page) goto out; @@ -245,7 +245,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) } out_unlock: spin_unlock(&bd_mapping->private_lock); - page_cache_release(page); + put_page(page); out: return ret; } @@ -255,17 +255,17 @@ out: */ static void free_more_memory(void) { - struct zone *zone; + struct zoneref *z; int nid; wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { - (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL, - &zone); - if (zone) + + z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL); + if (z->zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, GFP_NOFS, NULL); } @@ -1040,7 +1040,7 @@ done: ret = (block < end_block) ? 1 : -ENXIO; failed: unlock_page(page); - page_cache_release(page); + put_page(page); return ret; } @@ -1533,7 +1533,7 @@ void block_invalidatepage(struct page *page, unsigned int offset, /* * Check for overflow */ - BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + BUG_ON(stop > PAGE_SIZE || stop < length); head = page_buffers(page); bh = head; @@ -1716,7 +1716,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, blocksize = bh->b_size; bbits = block_size_bits(blocksize); - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + block = (sector_t)page->index << (PAGE_SHIFT - bbits); last_block = (i_size_read(inode) - 1) >> bbits; /* @@ -1894,7 +1894,7 @@ EXPORT_SYMBOL(page_zero_new_buffers); int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; struct inode *inode = page->mapping->host; unsigned block_start, block_end; @@ -1904,15 +1904,15 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; BUG_ON(!PageLocked(page)); - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > PAGE_SIZE); + BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); head = create_page_buffers(page, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + block = (sector_t)page->index << (PAGE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { @@ -2020,7 +2020,7 @@ static int __block_commit_write(struct inode *inode, struct page *page, int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, get_block_t *get_block) { - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; struct page *page; int status; @@ -2031,7 +2031,7 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, status = __block_write_begin(page, pos, len, get_block); if (unlikely(status)) { unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; } @@ -2047,7 +2047,7 @@ int block_write_end(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; unsigned start; - start = pos & (PAGE_CACHE_SIZE - 1); + start = pos & (PAGE_SIZE - 1); if (unlikely(copied < len)) { /* @@ -2099,7 +2099,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, } unlock_page(page); - page_cache_release(page); + put_page(page); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); @@ -2136,9 +2136,9 @@ int block_is_partially_uptodate(struct page *page, unsigned long from, head = page_buffers(page); blocksize = head->b_size; - to = min_t(unsigned, PAGE_CACHE_SIZE - from, count); + to = min_t(unsigned, PAGE_SIZE - from, count); to = from + to; - if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) + if (from < blocksize && to > PAGE_SIZE - blocksize) return 0; bh = head; @@ -2181,7 +2181,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) blocksize = head->b_size; bbits = block_size_bits(blocksize); - iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + iblock = (sector_t)page->index << (PAGE_SHIFT - bbits); lblock = (i_size_read(inode)+blocksize-1) >> bbits; bh = head; nr = 0; @@ -2295,16 +2295,16 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, unsigned zerofrom, offset, len; int err = 0; - index = pos >> PAGE_CACHE_SHIFT; - offset = pos & ~PAGE_CACHE_MASK; + index = pos >> PAGE_SHIFT; + offset = pos & ~PAGE_MASK; - while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { - zerofrom = curpos & ~PAGE_CACHE_MASK; + while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) { + zerofrom = curpos & ~PAGE_MASK; if (zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; } - len = PAGE_CACHE_SIZE - zerofrom; + len = PAGE_SIZE - zerofrom; err = pagecache_write_begin(file, mapping, curpos, len, AOP_FLAG_UNINTERRUPTIBLE, @@ -2329,7 +2329,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, /* page covers the boundary, find the boundary offset */ if (index == curidx) { - zerofrom = curpos & ~PAGE_CACHE_MASK; + zerofrom = curpos & ~PAGE_MASK; /* if we will expand the thing last block will be filled */ if (offset <= zerofrom) { goto out; @@ -2375,7 +2375,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping, if (err) return err; - zerofrom = *bytes & ~PAGE_CACHE_MASK; + zerofrom = *bytes & ~PAGE_MASK; if (pos+len > *bytes && zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; @@ -2430,10 +2430,10 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, } /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) - end = size & ~PAGE_CACHE_MASK; + if (((page->index + 1) << PAGE_SHIFT) > size) + end = size & ~PAGE_MASK; else - end = PAGE_CACHE_SIZE; + end = PAGE_SIZE; ret = __block_write_begin(page, 0, end, get_block); if (!ret) @@ -2508,8 +2508,8 @@ int nobh_write_begin(struct address_space *mapping, int ret = 0; int is_mapped_to_disk = 1; - index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); + index = pos >> PAGE_SHIFT; + from = pos & (PAGE_SIZE - 1); to = from + len; page = grab_cache_page_write_begin(mapping, index, flags); @@ -2543,7 +2543,7 @@ int nobh_write_begin(struct address_space *mapping, goto out_release; } - block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); /* * We loop across all blocks in the page, whether or not they are @@ -2551,7 +2551,7 @@ int nobh_write_begin(struct address_space *mapping, * page is fully mapped-to-disk. */ for (block_start = 0, block_in_page = 0, bh = head; - block_start < PAGE_CACHE_SIZE; + block_start < PAGE_SIZE; block_in_page++, block_start += blocksize, bh = bh->b_this_page) { int create; @@ -2623,7 +2623,7 @@ failed: out_release: unlock_page(page); - page_cache_release(page); + put_page(page); *pagep = NULL; return ret; @@ -2653,7 +2653,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping, } unlock_page(page); - page_cache_release(page); + put_page(page); while (head) { bh = head; @@ -2675,7 +2675,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block, { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + const pgoff_t end_index = i_size >> PAGE_SHIFT; unsigned offset; int ret; @@ -2684,7 +2684,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block, goto out; /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); + offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { /* * The page may have dirty, unmapped buffers. For example, @@ -2707,7 +2707,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); out: ret = mpage_writepage(page, get_block, wbc); if (ret == -EAGAIN) @@ -2720,8 +2720,8 @@ EXPORT_SYMBOL(nobh_writepage); int nobh_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + pgoff_t index = from >> PAGE_SHIFT; + unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize; sector_t iblock; unsigned length, pos; @@ -2738,7 +2738,7 @@ int nobh_truncate_page(struct address_space *mapping, return 0; length = blocksize - length; - iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits); page = grab_cache_page(mapping, index); err = -ENOMEM; @@ -2748,7 +2748,7 @@ int nobh_truncate_page(struct address_space *mapping, if (page_has_buffers(page)) { has_buffers: unlock_page(page); - page_cache_release(page); + put_page(page); return block_truncate_page(mapping, from, get_block); } @@ -2772,7 +2772,7 @@ has_buffers: if (!PageUptodate(page)) { err = mapping->a_ops->readpage(NULL, page); if (err) { - page_cache_release(page); + put_page(page); goto out; } lock_page(page); @@ -2789,7 +2789,7 @@ has_buffers: unlock: unlock_page(page); - page_cache_release(page); + put_page(page); out: return err; } @@ -2798,8 +2798,8 @@ EXPORT_SYMBOL(nobh_truncate_page); int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + pgoff_t index = from >> PAGE_SHIFT; + unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize; sector_t iblock; unsigned length, pos; @@ -2816,7 +2816,7 @@ int block_truncate_page(struct address_space *mapping, return 0; length = blocksize - length; - iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits); page = grab_cache_page(mapping, index); err = -ENOMEM; @@ -2865,7 +2865,7 @@ int block_truncate_page(struct address_space *mapping, unlock: unlock_page(page); - page_cache_release(page); + put_page(page); out: return err; } @@ -2879,7 +2879,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block, { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + const pgoff_t end_index = i_size >> PAGE_SHIFT; unsigned offset; /* Is the page fully inside i_size? */ @@ -2888,14 +2888,14 @@ int block_write_full_page(struct page *page, get_block_t *get_block, end_buffer_async_write); /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); + offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { /* * The page may have dirty, unmapped buffers. For example, * they may have been added in ext3_writepage(). Make them * freeable here, so the page does not leak. */ - do_invalidatepage(page, 0, PAGE_CACHE_SIZE); + do_invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); return 0; /* don't care */ } @@ -2907,7 +2907,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); return __block_write_full_page(inode, page, get_block, wbc, end_buffer_async_write); } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index c0f3da3926a0..afbdc418966d 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -194,10 +194,10 @@ static void cachefiles_read_copier(struct fscache_operation *_op) error = -EIO; } - page_cache_release(monitor->back_page); + put_page(monitor->back_page); fscache_end_io(op, monitor->netfs_page, error); - page_cache_release(monitor->netfs_page); + put_page(monitor->netfs_page); fscache_retrieval_complete(op, 1); fscache_put_retrieval(op); kfree(monitor); @@ -288,8 +288,8 @@ monitor_backing_page: _debug("- monitor add"); /* install the monitor */ - page_cache_get(monitor->netfs_page); - page_cache_get(backpage); + get_page(monitor->netfs_page); + get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; add_page_wait_queue(backpage, &monitor->monitor); @@ -310,7 +310,7 @@ backing_page_already_present: _debug("- present"); if (newpage) { - page_cache_release(newpage); + put_page(newpage); newpage = NULL; } @@ -342,7 +342,7 @@ success: out: if (backpage) - page_cache_release(backpage); + put_page(backpage); if (monitor) { fscache_put_retrieval(monitor->op); kfree(monitor); @@ -363,7 +363,7 @@ io_error: goto out; nomem_page: - page_cache_release(newpage); + put_page(newpage); nomem_monitor: fscache_put_retrieval(monitor->op); kfree(monitor); @@ -530,7 +530,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { - page_cache_release(netpage); + put_page(netpage); fscache_retrieval_complete(op, 1); continue; } @@ -538,10 +538,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, } /* install a monitor */ - page_cache_get(netpage); + get_page(netpage); monitor->netfs_page = netpage; - page_cache_get(backpage); + get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; add_page_wait_queue(backpage, &monitor->monitor); @@ -555,10 +555,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, unlock_page(backpage); } - page_cache_release(backpage); + put_page(backpage); backpage = NULL; - page_cache_release(netpage); + put_page(netpage); netpage = NULL; continue; @@ -603,7 +603,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { - page_cache_release(netpage); + put_page(netpage); fscache_retrieval_complete(op, 1); continue; } @@ -612,14 +612,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, copy_highpage(netpage, backpage); - page_cache_release(backpage); + put_page(backpage); backpage = NULL; fscache_mark_page_cached(op, netpage); /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); - page_cache_release(netpage); + put_page(netpage); netpage = NULL; fscache_retrieval_complete(op, 1); continue; @@ -632,11 +632,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, out: /* tidy up */ if (newpage) - page_cache_release(newpage); + put_page(newpage); if (netpage) - page_cache_release(netpage); + put_page(netpage); if (backpage) - page_cache_release(backpage); + put_page(backpage); if (monitor) { fscache_put_retrieval(op); kfree(monitor); @@ -644,7 +644,7 @@ out: list_for_each_entry_safe(netpage, _n, list, lru) { list_del(&netpage->lru); - page_cache_release(netpage); + put_page(netpage); fscache_retrieval_complete(op, 1); } diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index f19708487e2f..4f67227f69a5 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -37,6 +37,8 @@ static inline void ceph_set_cached_acl(struct inode *inode, spin_lock(&ci->i_ceph_lock); if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); spin_unlock(&ci->i_ceph_lock); } @@ -88,7 +90,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) char *value = NULL; struct iattr newattrs; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; - struct dentry *dentry; switch (type) { case ACL_TYPE_ACCESS: @@ -126,29 +127,26 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) goto out_free; } - dentry = d_find_alias(inode); if (new_mode != old_mode) { newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE; - ret = ceph_setattr(dentry, &newattrs); + ret = __ceph_setattr(inode, &newattrs); if (ret) - goto out_dput; + goto out_free; } - ret = __ceph_setxattr(dentry, name, value, size, 0); + ret = __ceph_setxattr(inode, name, value, size, 0); if (ret) { if (new_mode != old_mode) { newattrs.ia_mode = old_mode; newattrs.ia_valid = ATTR_MODE; - ceph_setattr(dentry, &newattrs); + __ceph_setattr(inode, &newattrs); } - goto out_dput; + goto out_free; } ceph_set_cached_acl(inode, type, acl); -out_dput: - dput(dentry); out_free: kfree(value); out: diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index fc5cae2a0db2..eeb71e5de27a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -143,7 +143,7 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, inode = page->mapping->host; ci = ceph_inode(inode); - if (offset != 0 || length != PAGE_CACHE_SIZE) { + if (offset != 0 || length != PAGE_SIZE) { dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", inode, page, page->index, offset, length); return; @@ -197,10 +197,10 @@ static int readpage_nounlock(struct file *filp, struct page *page) &ceph_inode_to_client(inode)->client->osdc; int err = 0; u64 off = page_offset(page); - u64 len = PAGE_CACHE_SIZE; + u64 len = PAGE_SIZE; if (off >= i_size_read(inode)) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); return 0; } @@ -212,7 +212,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) */ if (off == 0) return -EINVAL; - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); return 0; } @@ -234,9 +234,9 @@ static int readpage_nounlock(struct file *filp, struct page *page) ceph_fscache_readpage_cancel(inode, page); goto out; } - if (err < PAGE_CACHE_SIZE) + if (err < PAGE_SIZE) /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_CACHE_SIZE); + zero_user_segment(page, err, PAGE_SIZE); else flush_dcache_page(page); @@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page) /* * Finish an async read(ahead) op. */ -static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) +static void finish_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_osd_data *osd_data; - int rc = req->r_result; - int bytes = le32_to_cpu(msg->hdr.data_len); + int rc = req->r_result <= 0 ? req->r_result : 0; + int bytes = req->r_result >= 0 ? req->r_result : 0; int num_pages; int i; @@ -278,10 +278,10 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) if (rc < 0 && rc != -ENOENT) goto unlock; - if (bytes < (int)PAGE_CACHE_SIZE) { + if (bytes < (int)PAGE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; - zero_user_segment(page, s, PAGE_CACHE_SIZE); + zero_user_segment(page, s, PAGE_SIZE); } dout("finish_read %p uptodate %p idx %lu\n", inode, page, page->index); @@ -290,8 +290,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) ceph_readpage_to_fscache(inode, page); unlock: unlock_page(page); - page_cache_release(page); - bytes -= PAGE_CACHE_SIZE; + put_page(page); + bytes -= PAGE_SIZE; } kfree(osd_data->pages); } @@ -336,7 +336,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) if (max && nr_pages == max) break; } - len = nr_pages << PAGE_CACHE_SHIFT; + len = nr_pages << PAGE_SHIFT; dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, off, len); vino = ceph_vino(inode); @@ -364,7 +364,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_KERNEL)) { ceph_fscache_uncache_page(inode, page); - page_cache_release(page); + put_page(page); dout("start_read %p add_to_page_cache failed %p\n", inode, page); nr_pages = i; @@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -415,8 +413,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc == 0) goto out; - if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) - max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + if (fsc->mount_options->rsize >= PAGE_SIZE) + max = (fsc->mount_options->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; dout("readpages %p file %p nr_pages %d max %d\n", inode, @@ -484,7 +482,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) long writeback_stat; u64 truncate_size; u32 truncate_seq; - int err = 0, len = PAGE_CACHE_SIZE; + int err = 0, len = PAGE_SIZE; dout("writepage %p idx %lu\n", page, page->index); @@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { - dout("writepage setting page/mapping error %d %p\n", err, page); + struct writeback_control tmp_wbc; + if (!wbc) + wbc = &tmp_wbc; + if (err == -ERESTARTSYS) { + /* killed by SIGKILL */ + dout("writepage interrupted page %p\n", page); + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + goto out; + } + dout("writepage setting page/mapping error %d %p\n", + err, page); SetPageError(page); mapping_set_error(&inode->i_data, err); - if (wbc) - wbc->pages_skipped++; + wbc->pages_skipped++; } else { dout("writepage cleaned page %p\n", page); err = 0; /* vfs expects us to return 0 */ @@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) BUG_ON(!inode); ihold(inode); err = writepage_nounlock(page, wbc); + if (err == -ERESTARTSYS) { + /* direct memory reclaimer was killed by SIGKILL. return 0 + * to prevent caller from setting mapping/page error */ + err = 0; + } unlock_page(page); iput(inode); return err; } - /* * lame release_pages helper. release_pages() isn't exported to * modules. @@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num) * If we get an error, set the mapping error bit, but not the individual * page error bits. */ -static void writepages_finish(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); @@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_fs_client *fsc = ceph_inode_to_client(inode); bool remove_page; - dout("writepages_finish %p rc %d\n", inode, rc); if (rc < 0) mapping_set_error(mapping, rc); @@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req, clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + if (rc < 0) + SetPageError(page); + ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); @@ -718,16 +731,19 @@ static int ceph_writepages_start(struct address_space *mapping, (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - pr_warn("writepage_start %p on forced umount\n", inode); - truncate_pagecache(inode, 0); + if (ci->i_wrbuffer_ref > 0) { + pr_warn_ratelimited( + "writepage_start %p %lld forced umount\n", + inode, ceph_ino(inode)); + } mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - if (wsize < PAGE_CACHE_SIZE) - wsize = PAGE_CACHE_SIZE; - max_pages_ever = wsize >> PAGE_CACHE_SHIFT; + if (wsize < PAGE_SIZE) + wsize = PAGE_SIZE; + max_pages_ever = wsize >> PAGE_SHIFT; pagevec_init(&pvec, 0); @@ -737,8 +753,8 @@ static int ceph_writepages_start(struct address_space *mapping, end = -1; dout(" cyclic, start at %lu\n", start); } else { - start = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + start = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; should_loop = 0; @@ -887,7 +903,7 @@ get_more_pages: num_ops = 1 + do_sync; strip_unit_end = page->index + - ((len - 1) >> PAGE_CACHE_SHIFT); + ((len - 1) >> PAGE_SHIFT); BUG_ON(pages); max_pages = calc_pages_for(0, (u64)len); @@ -901,7 +917,7 @@ get_more_pages: len = 0; } else if (page->index != - (offset + len) >> PAGE_CACHE_SHIFT) { + (offset + len) >> PAGE_SHIFT) { if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS)) { redirty_page_for_writepage(wbc, page); @@ -929,7 +945,7 @@ get_more_pages: pages[locked_pages] = page; locked_pages++; - len += PAGE_CACHE_SIZE; + len += PAGE_SIZE; } /* did we get anything? */ @@ -981,7 +997,7 @@ new_request: BUG_ON(IS_ERR(req)); } BUG_ON(len < page_offset(pages[locked_pages - 1]) + - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); req->r_callback = writepages_finish; req->r_inode = inode; @@ -1011,7 +1027,7 @@ new_request: } set_page_writeback(pages[i]); - len += PAGE_CACHE_SIZE; + len += PAGE_SIZE; } if (snap_size != -1) { @@ -1020,7 +1036,7 @@ new_request: /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ - u64 min_len = len + 1 - PAGE_CACHE_SIZE; + u64 min_len = len + 1 - PAGE_SIZE; len = min(len, (u64)i_size_read(inode) - offset); len = max(len, min_len); } @@ -1063,10 +1079,7 @@ new_request: pages = NULL; } - vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, snapc, vino.snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1099,8 +1112,7 @@ release_pvec_pages: mapping->writeback_index = index; out: - if (req) - ceph_osdc_put_request(req); + ceph_osdc_put_request(req); ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); return rc; @@ -1134,14 +1146,21 @@ static int ceph_update_writeable_page(struct file *file, struct page *page) { struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - loff_t page_off = pos & PAGE_CACHE_MASK; - int pos_in_page = pos & ~PAGE_CACHE_MASK; + loff_t page_off = pos & PAGE_MASK; + int pos_in_page = pos & ~PAGE_MASK; int end_in_page = pos_in_page + len; loff_t i_size; int r; struct ceph_snap_context *snapc, *oldest; + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout(" page %p forced umount\n", page); + unlock_page(page); + return -EIO; + } + retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); @@ -1165,7 +1184,7 @@ retry_locked: snapc = ceph_get_snap_context(snapc); unlock_page(page); ceph_queue_writeback(inode); - r = wait_event_interruptible(ci->i_cap_wq, + r = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); if (r == -ERESTARTSYS) @@ -1191,7 +1210,7 @@ retry_locked: } /* full page? */ - if (pos_in_page == 0 && len == PAGE_CACHE_SIZE) + if (pos_in_page == 0 && len == PAGE_SIZE) return 0; /* past end of file? */ @@ -1199,12 +1218,12 @@ retry_locked: if (page_off >= i_size || (pos_in_page == 0 && (pos+len) >= i_size && - end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { + end_in_page - pos_in_page != PAGE_SIZE)) { dout(" zeroing %p 0 - %d and %d - %d\n", - page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE); + page, pos_in_page, end_in_page, (int)PAGE_SIZE); zero_user_segments(page, 0, pos_in_page, - end_in_page, PAGE_CACHE_SIZE); + end_in_page, PAGE_SIZE); return 0; } @@ -1228,7 +1247,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = file_inode(file); struct page *page; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; int r; do { @@ -1242,7 +1261,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, r = ceph_update_writeable_page(file, pos, len, page); if (r < 0) - page_cache_release(page); + put_page(page); else *pagep = page; } while (r == -EAGAIN); @@ -1259,7 +1278,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned from = pos & (PAGE_SIZE - 1); int check_cap = 0; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, @@ -1279,7 +1298,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); if (check_cap) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); @@ -1292,8 +1311,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, * intercept O_DIRECT reads and writes early, this function should * never get called. */ -static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos) +static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter) { WARN_ON(1); return -EINVAL; @@ -1312,6 +1330,17 @@ const struct address_space_operations ceph_aops = { .direct_IO = ceph_direct_io, }; +static void ceph_block_sigs(sigset_t *oldset) +{ + sigset_t mask; + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void ceph_restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} /* * vm ops @@ -1322,28 +1351,26 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; struct page *pinned_page = NULL; - loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; + loff_t off = vmf->pgoff << PAGE_SHIFT; int want, got, ret; + sigset_t oldset; + + ceph_block_sigs(&oldset); dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", - inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); + inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, - -1, &got, &pinned_page); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - return VM_FAULT_SIGBUS; - } - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + if (ret < 0) + goto out_restore; + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", - inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); + inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || ci->i_inline_version == CEPH_INLINE_NONE) @@ -1352,16 +1379,16 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ret = -EAGAIN; dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", - inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got), ret); if (pinned_page) - page_cache_release(pinned_page); + put_page(pinned_page); ceph_put_cap_refs(ci, got); if (ret != -EAGAIN) - return ret; + goto out_restore; /* read inline data */ - if (off >= PAGE_CACHE_SIZE) { + if (off >= PAGE_SIZE) { /* does not support inline data > PAGE_SIZE */ ret = VM_FAULT_SIGBUS; } else { @@ -1372,27 +1399,35 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; - goto out; + goto out_inline; } ret1 = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true); if (ret1 < 0 || off >= i_size_read(inode)) { unlock_page(page); - page_cache_release(page); - ret = VM_FAULT_SIGBUS; - goto out; + put_page(page); + if (ret1 < 0) + ret = ret1; + else + ret = VM_FAULT_SIGBUS; + goto out_inline; } - if (ret1 < PAGE_CACHE_SIZE) - zero_user_segment(page, ret1, PAGE_CACHE_SIZE); + if (ret1 < PAGE_SIZE) + zero_user_segment(page, ret1, PAGE_SIZE); else flush_dcache_page(page); SetPageUptodate(page); vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; +out_inline: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_SIZE, ret); } -out: - dout("filemap_fault %p %llu~%zd read inline data ret %d\n", - inode, off, (size_t)PAGE_CACHE_SIZE, ret); +out_restore: + ceph_restore_sigs(&oldset); + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + return ret; } @@ -1410,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size = i_size_read(inode); size_t len; int want, got, ret; + sigset_t oldset; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) - return VM_FAULT_SIGBUS; + return VM_FAULT_OOM; + + ceph_block_sigs(&oldset); if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; @@ -1424,16 +1462,14 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_free; - } } - if (off + PAGE_CACHE_SIZE <= size) - len = PAGE_CACHE_SIZE; + if (off + PAGE_SIZE <= size) + len = PAGE_SIZE; else - len = size & ~PAGE_CACHE_MASK; + len = size & ~PAGE_MASK; dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", inode, ceph_vinop(inode), off, len, size); @@ -1441,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_BUFFER; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, - &got, NULL); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - ret = VM_FAULT_SIGBUS; - goto out_free; - } - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); + if (ret < 0) + goto out_free; + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", inode, off, len, ceph_cap_string(got)); /* Update time before taking page lock */ file_update_time(vma->vm_file); - lock_page(page); + do { + lock_page(page); - ret = VM_FAULT_NOPAGE; - if ((off > size) || - (page->mapping != inode->i_mapping)) { - unlock_page(page); - goto out; - } + if ((off > size) || (page->mapping != inode->i_mapping)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + break; + } + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret >= 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + ret = VM_FAULT_LOCKED; + } + } while (ret == -EAGAIN); - ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret >= 0) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); - ret = VM_FAULT_LOCKED; - } else { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; - } -out: if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; @@ -1496,8 +1523,10 @@ out: inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); out_free: + ceph_restore_sigs(&oldset); ceph_free_cap_flush(prealloc_cf); - + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; return ret; } @@ -1519,7 +1548,7 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, return; if (PageUptodate(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); return; } } @@ -1534,14 +1563,14 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, } if (page != locked_page) { - if (len < PAGE_CACHE_SIZE) - zero_user_segment(page, len, PAGE_CACHE_SIZE); + if (len < PAGE_SIZE) + zero_user_segment(page, len, PAGE_SIZE); else flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); - page_cache_release(page); + put_page(page); } } @@ -1578,7 +1607,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) from_pagecache = true; lock_page(page); } else { - page_cache_release(page); + put_page(page); page = NULL; } } @@ -1586,8 +1615,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) if (page) { len = i_size_read(inode); - if (len > PAGE_CACHE_SIZE) - len = PAGE_CACHE_SIZE; + if (len > PAGE_SIZE) + len = PAGE_SIZE; } else { page = __page_cache_alloc(GFP_NOFS); if (!page) { @@ -1615,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1658,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1670,7 +1699,7 @@ out: if (page && page != locked_page) { if (from_pagecache) { unlock_page(page); - page_cache_release(page); + put_page(page); } else __free_pages(page, 0); } @@ -1759,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; - snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), - "%llx.00000000", ci->i_vino.ino); - rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); + + err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); + if (err) + goto out_unlock; wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); @@ -1770,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) goto out_unlock; } - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); - wr_req->r_base_oloc.pool = pool; - wr_req->r_base_oid = rd_req->r_base_oid; + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); + ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); + + err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); + if (err) + goto out_unlock; /* one page should be large enough for STAT data */ pages = ceph_alloc_page_vector(1, GFP_KERNEL); @@ -1785,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); + wr_req->r_mtime = ci->vfs_inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) @@ -1824,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) out_unlock: up_write(&mdsc->pool_perm_rwsem); - if (rd_req) - ceph_osdc_put_request(rd_req); - if (wr_req) - ceph_osdc_put_request(wr_req); + ceph_osdc_put_request(rd_req); + ceph_osdc_put_request(wr_req); out: if (!err) err = have; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a351480dbabc..c052b5bf219b 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int unlock_page(page); } -static inline int cache_valid(struct ceph_inode_info *ci) +static inline bool cache_valid(struct ceph_inode_info *ci) { return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && (ci->i_fscache_gen == ci->i_rdcache_gen)); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index de17bb232ff8..c17b5d76d75e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1656,7 +1656,7 @@ retry_locked: */ if ((!is_delayed || mdsc->stopping) && !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ @@ -1698,8 +1698,8 @@ retry_locked: revoking = cap->implemented & ~cap->issued; dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", - cap->mds, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap_used), + cap->mds, cap, ceph_cap_string(cap_used), + ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -2317,7 +2317,7 @@ again: /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); - if ((file_wanted & need) == 0) { + if ((file_wanted & need) != need) { dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; @@ -2412,12 +2412,26 @@ again: goto out_unlock; } - if (!__ceph_is_any_caps(ci) && - ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - *err = -EIO; - ret = 1; - goto out_unlock; + if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { + int mds_wanted; + if (ACCESS_ONCE(mdsc->fsc->mount_state) == + CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + *err = -EIO; + ret = 1; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci); + if ((mds_wanted & need) != need) { + dout("get_cap_refs %p caps were dropped" + " (session killed?)\n", inode); + *err = -ESTALE; + ret = 1; + goto out_unlock; + } + if ((mds_wanted & file_wanted) == + (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } dout("get_cap_refs %p have %s needed %s\n", inode, @@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (err == -EAGAIN) continue; if (err < 0) - return err; + ret = err; } else { ret = wait_event_interruptible(ci->i_cap_wq, try_get_cap_refs(ci, need, want, endoff, @@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, continue; if (err < 0) ret = err; - if (ret < 0) - return ret; + } + if (ret < 0) { + if (err == -ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(&ci->vfs_inode); + if (ret == 0) + continue; + } + return ret; } if (ci->i_inline_version != CEPH_INLINE_NONE && @@ -2510,7 +2531,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, *pinned_page = page; break; } - page_cache_release(page); + put_page(page); } /* * drop cap refs first because getattr while @@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - !ci->i_wrbuffer_ref) { + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ @@ -3226,6 +3247,8 @@ retry: if (target < 0) { __ceph_remove_cap(cap, false); + if (!ci->i_auth_cap) + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; goto out_unlock; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 31f831471ed2..39ff678e567f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); kfree(path); - } else if (req->r_path2) { + } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, req->r_path2); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index fadc243dfb28..6e0fedf6713b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -5,6 +5,7 @@ #include <linux/namei.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/xattr.h> #include "super.h" #include "mds_client.h" @@ -69,16 +70,42 @@ out_unlock: } /* - * for readdir, we encode the directory frag and offset within that - * frag into f_pos. + * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) +{ + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return fpos; +} + +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} + static unsigned fpos_frag(loff_t p) { - return p >> 32; + return p >> OFFSET_BITS; } + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + static unsigned fpos_off(loff_t p) { - return p & 0xffffffff; + return p & OFFSET_MASK; } static int fpos_cmp(loff_t l, loff_t r) @@ -110,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } + +static struct dentry * +__dcache_find_get_entry(struct dentry *parent, u64 idx, + struct ceph_readdir_cache_control *cache_ctl) +{ + struct inode *dir = d_inode(parent); + struct dentry *dentry; + unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; + loff_t ptr_pos = idx * sizeof(struct dentry *); + pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT; + + if (ptr_pos >= i_size_read(dir)) + return NULL; + + if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { + ceph_readdir_cache_release(cache_ctl); + cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); + if (!cache_ctl->page) { + dout(" page %lu not found\n", ptr_pgoff); + return ERR_PTR(-EAGAIN); + } + /* reading/filling the cache are serialized by + i_mutex, no need to use page lock */ + unlock_page(cache_ctl->page); + cache_ctl->dentries = kmap(cache_ctl->page); + } + + cache_ctl->index = idx & idx_mask; + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) + dentry = cache_ctl->dentries[cache_ctl->index]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + return dentry ? : ERR_PTR(-EAGAIN); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on @@ -129,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct inode *dir = d_inode(parent); struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; - unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *); - int err = 0; - loff_t ptr_pos = 0; struct ceph_readdir_cache_control cache_ctl = {}; + u64 idx = 0; + int err = 0; - dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); + dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); + + /* search start position */ + if (ctx->pos > 2) { + u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *)); + while (count > 0) { + u64 step = count >> 1; + dentry = __dcache_find_get_entry(parent, idx + step, + &cache_ctl); + if (!dentry) { + /* use linar search */ + idx = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + if (fpos_cmp(di->offset, ctx->pos) < 0) { + idx += step + 1; + count -= step + 1; + } else { + count = step; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + } - /* we can calculate cache index for the first dirfrag */ - if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { - cache_ctl.index = fpos_off(ctx->pos) - 2; - BUG_ON(cache_ctl.index < 0); - ptr_pos = cache_ctl.index * sizeof(struct dentry *); + dout("__dcache_readdir %p cache idx %llu\n", dir, idx); } - while (true) { - pgoff_t pgoff; - bool emit_dentry; - if (ptr_pos >= i_size_read(dir)) { + for (;;) { + bool emit_dentry = false; + dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); + if (!dentry) { fi->flags |= CEPH_F_ATEND; err = 0; break; } - - err = -EAGAIN; - pgoff = ptr_pos >> PAGE_CACHE_SHIFT; - if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { - ceph_readdir_cache_release(&cache_ctl); - cache_ctl.page = find_lock_page(&dir->i_data, pgoff); - if (!cache_ctl.page) { - dout(" page %lu not found\n", pgoff); - break; - } - /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ - unlock_page(cache_ctl.page); - cache_ctl.dentries = kmap(cache_ctl.page); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; } - rcu_read_lock(); - spin_lock(&parent->d_lock); - /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex. */ - if (ceph_dir_is_complete_ordered(dir) && - ptr_pos < i_size_read(dir)) - dentry = cache_ctl.dentries[cache_ctl.index % nsize]; - else - dentry = NULL; - spin_unlock(&parent->d_lock); - if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) - dentry = NULL; - rcu_read_unlock(); - if (!dentry) - break; - - emit_dentry = false; di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && d_really_is_positive(dentry) && - ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && - ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) { emit_dentry = true; } spin_unlock(&dentry->d_lock); if (emit_dentry) { - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dout(" %llx dentry %p %pd %p\n", di->offset, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, @@ -217,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, } else { dput(dentry); } - - cache_ctl.index++; - ptr_pos += sizeof(struct dentry *); } +out: ceph_readdir_cache_release(&cache_ctl); if (last) { int ret; @@ -234,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, return err; } +static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +{ + if (!fi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + else + return fi->frag != fpos_frag(pos); +} + static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -241,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(ctx->pos); - int off = fpos_off(ctx->pos); + int i; int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); + dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); if (fi->flags & CEPH_F_ATEND) return 0; @@ -259,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 1; - off = 1; } if (ctx->pos == 1) { ino_t ino = parent_ino(file->f_path.dentry); @@ -269,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 2; - off = 2; } /* can we use the dcache? */ @@ -284,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; - frag = fpos_frag(ctx->pos); - off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -293,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (fi->frag != frag || fi->last_readdir == NULL) { + if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; + unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -304,6 +372,13 @@ more: fi->last_readdir = NULL; } + if (is_hash_order(ctx->pos)) { + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -330,6 +405,8 @@ more: req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); req->r_inode = inode; ihold(inode); @@ -339,22 +416,26 @@ more: ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d" - " on frag %x, end=%d, complete=%d\n", err, frag, + dout("readdir got and parsed readdir result=%d on " + "frag %x, end=%d, complete=%d, hash_order=%d\n", + err, frag, (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete); - + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); - /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - off = req->r_readdir_offset; - fi->next_offset = off; + if (!rinfo->hash_order) { + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + fi->next_offset, + false); + } } fi->frag = frag; - fi->offset = fi->next_offset; fi->last_readdir = req; if (req->r_did_prepopulate) { @@ -362,7 +443,8 @@ more: if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ fi->dir_ordered_count = 0; - } else if (ceph_frag_is_leftmost(frag) && off == 2) { + } else if (ceph_frag_is_leftmost(frag) && + fi->next_offset == 2) { /* note dir version at start of readdir so * we can tell if any dentries get dropped */ fi->dir_release_count = req->r_dir_release_cnt; @@ -376,65 +458,87 @@ more: fi->dir_release_count = 0; } - if (req->r_reply_info.dir_end) { - kfree(fi->last_name); - fi->last_name = NULL; - if (ceph_frag_is_rightmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; - } else { - err = note_last_dentry(fi, - rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1], - fi->next_offset + rinfo->dir_nr); + /* note next offset and last dentry name */ + if (rinfo->dir_nr > 0) { + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? + 2 : (fpos_off(rde->offset) + 1); + err = note_last_dentry(fi, rde->name, rde->name_len, + next_offset); if (err) return err; + } else if (req->r_reply_info.dir_end) { + fi->next_offset = 2; + /* keep last name */ } } rinfo = &fi->last_readdir->r_reply_info; - dout("readdir frag %x num %d off %d chunkoff %d\n", frag, - rinfo->dir_nr, off, fi->offset); - - ctx->pos = ceph_make_fpos(frag, off); - while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_inode *in = - rinfo->dir_in[off - fi->offset].in; + dout("readdir frag %x num %d pos %llx chunk first %llx\n", + fi->frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); + + i = 0; + /* search start position */ + if (rinfo->dir_nr > 0) { + int step, nr = rinfo->dir_nr; + while (nr > 0) { + step = nr >> 1; + if (rinfo->dir_entries[i + step].offset < ctx->pos) { + i += step + 1; + nr -= step + 1; + } else { + nr = step; + } + } + } + for (; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; - dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, ctx->pos, - rinfo->dir_dname_len[off - fi->offset], - rinfo->dir_dname[off - fi->offset], in); - BUG_ON(!in); - ftype = le32_to_cpu(in->mode) >> 12; - vino.ino = le64_to_cpu(in->ino); - vino.snap = le64_to_cpu(in->snapid); + BUG_ON(rde->offset < ctx->pos); + + ctx->pos = rde->offset; + dout("readdir (%d/%d) -> %llx '%.*s' %p\n", + i, rinfo->dir_nr, ctx->pos, + rde->name_len, rde->name, &rde->inode.in); + + BUG_ON(!rde->inode.in); + ftype = le32_to_cpu(rde->inode.in->mode) >> 12; + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); ino = ceph_vino_to_ino(vino); - if (!dir_emit(ctx, - rinfo->dir_dname[off - fi->offset], - rinfo->dir_dname_len[off - fi->offset], - ceph_translate_ino(inode->i_sb, ino), ftype)) { + + if (!dir_emit(ctx, rde->name, rde->name_len, + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } - off++; ctx->pos++; } - if (fi->last_name) { + if (fi->next_offset > 2) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; goto more; } /* more frags? */ - if (!ceph_frag_is_rightmost(frag)) { - frag = ceph_frag_next(frag); - off = 0; - ctx->pos = ceph_make_fpos(frag, off); + if (!ceph_frag_is_rightmost(fi->frag)) { + unsigned frag = ceph_frag_next(fi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + fi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); + kfree(fi->last_name); + fi->last_name = NULL; + } dout("readdir next frag is %x\n", frag); goto more; } @@ -466,7 +570,7 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi, unsigned frag) +static void reset_readdir(struct ceph_file_info *fi) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -476,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) fi->last_name = NULL; fi->dir_release_count = 0; fi->readdir_cache_idx = -1; - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; /* compensate for . and .. */ - else - fi->next_offset = 0; + fi->next_offset = 2; /* compensate for . and .. */ fi->flags &= ~CEPH_F_ATEND; } +/* + * discard buffered readdir content on seekdir(0), or seek to new frag, + * or seek prior to current chunk + */ +static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) +{ + struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; + if (new_pos == 0) + return true; + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sotred in hash order */ + } else if (fi->frag |= fpos_frag(new_pos)) { + return true; + } + rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; + if (!rinfo || !rinfo->dir_nr) + return true; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); +} + static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; inode_lock(inode); @@ -504,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { + if (need_reset_readdir(fi, offset)) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; + } + if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; fi->flags &= ~CEPH_F_ATEND; } retval = offset; - - if (offset == 0 || - fpos_frag(offset) != fi->frag || - fpos_off(offset) < fi->offset) { - /* discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk */ - dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi, fpos_frag(offset)); - } else if (fpos_cmp(offset, old_offset) > 0) { - /* reset dir_release_count if we did a forward seek */ - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; - } } out: inode_unlock(inode); @@ -590,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, return dentry; } -static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) { return ceph_ino(inode) == CEPH_INO_ROOT && strncmp(dentry->d_name.name, ".ceph", 5) == 0; @@ -1342,10 +1463,10 @@ const struct inode_operations ceph_dir_iops = { .permission = ceph_permission, .getattr = ceph_getattr, .setattr = ceph_setattr, - .setxattr = ceph_setxattr, - .getxattr = ceph_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ceph_listxattr, - .removexattr = ceph_removexattr, + .removexattr = generic_removexattr, .get_acl = ceph_get_acl, .set_acl = ceph_set_acl, .mknod = ceph_mknod, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ef38f01c1795..a888df6f2d71 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -192,6 +192,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) } /* + * try renew caps after session gets killed. + */ +int ceph_renew_caps(struct inode *inode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + int err, flags, wanted; + + spin_lock(&ci->i_ceph_lock); + wanted = __ceph_caps_file_wanted(ci); + if (__ceph_is_any_real_caps(ci) && + (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { + int issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + dout("renew caps %p want %s issued %s updating mds_wanted\n", + inode, ceph_cap_string(wanted), ceph_cap_string(issued)); + ceph_check_caps(ci, 0, NULL); + return 0; + } + spin_unlock(&ci->i_ceph_lock); + + flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; +#ifdef O_LAZY + if (wanted & CEPH_CAP_FILE_LAZYIO) + flags |= O_LAZY; +#endif + + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_fmode = -1; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); +out: + dout("renew caps %p open result=%d\n", inode, err); + return err < 0 ? err : 0; +} + +/* * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the * MDS). We do, however, need to inform the MDS (asynchronously) @@ -466,7 +519,7 @@ more: ret += zlen; } - didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; + didpages = (page_align + ret) >> PAGE_SHIFT; pos += ret; read = pos - off; left -= ret; @@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode, kfree(aio_req); } -static void ceph_aio_complete_req(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_aio_complete_req(struct ceph_osd_request *req) { int rc = req->r_result; struct inode *inode = req->r_inode; @@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - req->r_base_oloc = orig_req->r_base_oloc; - req->r_base_oid = orig_req->r_base_oid; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); + ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); + + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) { + ceph_osdc_put_request(req); + req = orig_req; + goto out; + } req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - ceph_osdc_build_request(req, req->r_ops[0].extent.offset, - snapc, CEPH_NOSNAP, &aio_req->mtime); + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; ceph_osdc_put_request(orig_req); @@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work) out: if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } ceph_put_snap_context(snapc); @@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) list_add_tail(&req->r_unsafe_item, &ci->i_unsafe_writes); spin_unlock(&ci->i_unsafe_lock); + + complete_all(&req->r_completion); } else { spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_item); @@ -806,8 +867,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write) { ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + count) >> PAGE_CACHE_SHIFT); + pos >> PAGE_SHIFT, + (pos + count) >> PAGE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); @@ -872,17 +933,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, * may block. */ truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_CACHE_SIZE - 1)); + (pos+len) | (PAGE_SIZE - 1)); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, false, false); - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; @@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req, false); if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } } return -EIOCBQUEUED; @@ -1006,8 +1065,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + count) >> PAGE_CACHE_SHIFT); + pos >> PAGE_SHIFT, + (pos + count) >> PAGE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); @@ -1036,7 +1095,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, * write from beginning of first page, * regardless of io alignment */ - num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { @@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, true); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - + req->r_mtime = mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1159,7 +1216,7 @@ again: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); if (pinned_page) { - page_cache_release(pinned_page); + put_page(pinned_page); pinned_page = NULL; } ceph_put_cap_refs(ci, got); @@ -1188,10 +1245,10 @@ again: if (retry_op == READ_INLINE) { BUG_ON(ret > 0 || read > 0); if (iocb->ki_pos < i_size && - iocb->ki_pos < PAGE_CACHE_SIZE) { + iocb->ki_pos < PAGE_SIZE) { loff_t end = min_t(loff_t, i_size, iocb->ki_pos + len); - end = min_t(loff_t, end, PAGE_CACHE_SIZE); + end = min_t(loff_t, end, PAGE_SIZE); if (statret < end) zero_user_segment(page, statret, end); ret = copy_page_to_iter(page, @@ -1382,12 +1439,11 @@ retry_snap: ceph_cap_string(got)); ceph_put_cap_refs(ci, got); - if (written >= 0 && - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + written - 1, 1); - if (err < 0) - written = err; + if (written >= 0) { + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)) + iocb->ki_flags |= IOCB_DSYNC; + + written = generic_write_sync(iocb, written); } goto out_unlocked; @@ -1463,21 +1519,21 @@ static inline void ceph_zero_partial_page( struct inode *inode, loff_t offset, unsigned size) { struct page *page; - pgoff_t index = offset >> PAGE_CACHE_SHIFT; + pgoff_t index = offset >> PAGE_SHIFT; page = find_lock_page(inode->i_mapping, index); if (page) { wait_on_page_writeback(page); - zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); + zero_user(page, offset & (PAGE_SIZE - 1), size); unlock_page(page); - page_cache_release(page); + put_page(page); } } static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, loff_t length) { - loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); + loff_t nearly = round_up(offset, PAGE_SIZE); if (offset < nearly) { loff_t size = nearly - offset; if (length < size) @@ -1486,8 +1542,8 @@ static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, offset += size; length -= size; } - if (length >= PAGE_CACHE_SIZE) { - loff_t size = round_down(length, PAGE_CACHE_SIZE); + if (length >= PAGE_SIZE) { + loff_t size = round_down(length, PAGE_SIZE); truncate_pagecache_range(inode, offset, offset + size - 1); offset += size; length -= size; @@ -1525,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ed58b168904a..f059b5997072 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -8,8 +8,10 @@ #include <linux/kernel.h> #include <linux/writeback.h> #include <linux/vmalloc.h> +#include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/random.h> +#include <linux/sort.h> #include "super.h" #include "mds_client.h" @@ -92,10 +94,10 @@ const struct inode_operations ceph_file_iops = { .permission = ceph_permission, .setattr = ceph_setattr, .getattr = ceph_getattr, - .setxattr = ceph_setxattr, - .getxattr = ceph_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ceph_listxattr, - .removexattr = ceph_removexattr, + .removexattr = generic_removexattr, .get_acl = ceph_get_acl, .set_acl = ceph_set_acl, }; @@ -253,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode, diri_auth = ci->i_auth_cap->mds; spin_unlock(&ci->i_ceph_lock); + if (mds == -1) /* CDIR_AUTH_PARENT */ + mds = diri_auth; + mutex_lock(&ci->i_fragtree_mutex); if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ @@ -299,20 +304,38 @@ out: return err; } +static int frag_tree_split_cmp(const void *l, const void *r) +{ + struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; + struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; + return ceph_frag_compare(ls->frag, rs->frag); +} + +static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) +{ + if (!frag) + return f == ceph_frag_make(0, 0); + if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) + return false; + return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); +} + static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_inode_frag *frag; + struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; - int i; - u32 id, nsplits; + unsigned i, split_by, nsplits; + u32 id; bool update = false; mutex_lock(&ci->i_fragtree_mutex); nsplits = le32_to_cpu(fragtree->nsplits); - if (nsplits) { + if (nsplits != ci->i_fragtree_nsplits) { + update = true; + } else if (nsplits) { i = prandom_u32() % nsplits; id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) @@ -331,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode, if (!update) goto out_unlock; + if (nsplits > 1) { + sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), + frag_tree_split_cmp, NULL); + } + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { id = le32_to_cpu(fragtree->splits[i].frag); + split_by = le32_to_cpu(fragtree->splits[i].by); + if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { + pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", ceph_vinop(inode), + i, nsplits, id, split_by); + continue; + } frag = NULL; while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); @@ -346,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode, break; } rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } frag = NULL; } if (!frag) { @@ -355,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode, if (IS_ERR(frag)) continue; } - frag->split_by = le32_to_cpu(fragtree->splits[i].by); + if (frag->split_by == 0) + ci->i_fragtree_nsplits++; + frag->split_by = split_by; dout(" frag %x split by %d\n", frag->frag, frag->split_by); + prev_frag = frag; } while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } } out_unlock: mutex_unlock(&ci->i_fragtree_mutex); @@ -512,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode) rb_erase(n, &ci->i_fragtree); kfree(frag); } + ci->i_fragtree_nsplits = 0; __ceph_destroy_xattrs(ci); if (ci->i_xattrs.blob) @@ -532,6 +583,11 @@ int ceph_drop_inode(struct inode *inode) return 1; } +static inline blkcnt_t calc_inode_blocks(u64 size) +{ + return (size + (1<<9) - 1) >> 9; +} + /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date @@ -554,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, size = 0; } i_size_write(inode, size); - inode->i_blocks = (size + (1<<9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", @@ -813,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); - err = -EINVAL; - if (WARN_ON(symlen != i_size_read(inode))) - goto out; + if (symlen != i_size_read(inode)) { + pr_err("fill_inode %llx.%llx BAD symlink " + "size %lld\n", ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } err = -ENOMEM; sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); @@ -1308,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, int i, err = 0; for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; struct inode *in; int rc; - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); in = ceph_get_inode(req->r_dentry->d_sb, vino); if (IS_ERR(in)) { @@ -1321,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + rc = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (rc < 0) { pr_err("fill_inode badness on %p got %d\n", in, rc); err = rc; - continue; } + iput(in); } return err; @@ -1338,7 +1399,7 @@ void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) { if (ctl->page) { kunmap(ctl->page); - page_cache_release(ctl->page); + put_page(ctl->page); ctl->page = NULL; } } @@ -1348,7 +1409,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, struct ceph_mds_request *req) { struct ceph_inode_info *ci = ceph_inode(dir); - unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*); + unsigned nsize = PAGE_SIZE / sizeof(struct dentry*); unsigned idx = ctl->index % nsize; pgoff_t pgoff = ctl->index / nsize; @@ -1367,7 +1428,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, unlock_page(ctl->page); ctl->dentries = kmap(ctl->page); if (idx == 0) - memset(ctl->dentries, 0, PAGE_CACHE_SIZE); + memset(ctl->dentries, 0, PAGE_SIZE); } if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && @@ -1386,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1393,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, skipped = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - struct ceph_dentry_info *di; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); + if (rinfo->hash_order && req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } + if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) + if (!rinfo->hash_order) req->r_readdir_offset = 2; - else - req->r_readdir_offset = 0; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { @@ -1426,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); req->r_readdir_cache_idx = 0; } cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; - dname.name = rinfo->dir_dname[i]; - dname.len = rinfo->dir_dname_len[i]; + dname.name = rde->name; + dname.len = rde->name_len; dname.hash = full_name_hash(dname.name, dname.len); - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); + + if (rinfo->hash_order) { + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + rde->name, rde->name_len); + hash = ceph_frag_value(hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } retry_lookup: dn = d_lookup(parent, &dname); @@ -1489,7 +1569,7 @@ retry_lookup: } } - ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + ret = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (ret < 0) { @@ -1522,11 +1602,9 @@ retry_lookup: dn = realdn; } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + ceph_dentry(dn)->offset = rde->offset; - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, + update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { @@ -1561,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); i_size_write(inode, size); - inode->i_blocks = (size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); /* tell the MDS if we are approaching max_size */ if ((size << 1) >= ci->i_max_size && @@ -1623,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work) struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_pg_inv_work); struct inode *inode = &ci->vfs_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); u32 orig_gen; int check = 0; mutex_lock(&ci->i_truncate_mutex); + + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", + inode, ceph_ino(inode)); + mapping_set_error(inode->i_mapping, -EIO); + truncate_pagecache(inode, 0); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); @@ -1640,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_pagecache(inode, 0); + if (invalidate_inode_pages2(inode->i_mapping) < 0) { + pr_err("invalidate_pages %p fails\n", inode); + } spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1770,22 +1861,18 @@ static const struct inode_operations ceph_symlink_iops = { .get_link = simple_get_link, .setattr = ceph_setattr, .getattr = ceph_getattr, - .setxattr = ceph_setxattr, - .getxattr = ceph_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ceph_listxattr, - .removexattr = ceph_removexattr, + .removexattr = generic_removexattr, }; -/* - * setattr - */ -int ceph_setattr(struct dentry *dentry, struct iattr *attr) +int __ceph_setattr(struct inode *inode, struct iattr *attr) { - struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf; int issued; int release = 0, dirtied = 0; @@ -1923,8 +2010,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { i_size_write(inode, attr->ia_size); - inode->i_blocks = - (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(attr->ia_size); inode->i_ctime = attr->ia_ctime; ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; @@ -2010,6 +2096,14 @@ out_put: } /* + * setattr + */ +int ceph_setattr(struct dentry *dentry, struct iattr *attr) +{ + return __ceph_setattr(d_inode(dentry), attr); +} + +/* * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. */ diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f851d8d70158..be6b1657b1af 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; - down_read(&osdc->map_sem); + down_read(&osdc->lock); r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return -EIO; } dl.file_offset -= dl.object_offset; @@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ceph_ino(inode), dl.object_no); oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); - ceph_oid_set_name(&oid, dl.object_name); + ceph_oid_printf(&oid, "%s", dl.object_name); - r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return r; } - dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); @@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) } else { memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); } - up_read(&osdc->map_sem); + up_read(&osdc->lock); /* send result back to user */ if (copy_to_user(arg, &dl, sizeof(dl))) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 44852c3ae531..2103b823bec0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end, ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); - info->dir_end = ceph_decode_8(p); - info->dir_complete = ceph_decode_8(p); + { + u16 flags = ceph_decode_16(p); + info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); + info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + } if (num == 0) goto done; - BUG_ON(!info->dir_in); - info->dir_dname = (void *)(info->dir_in + num); - info->dir_dname_len = (void *)(info->dir_dname + num); - info->dir_dlease = (void *)(info->dir_dname_len + num); - if ((unsigned long)(info->dir_dlease + num) > - (unsigned long)info->dir_in + info->dir_buf_size) { + BUG_ON(!info->dir_entries); + if ((unsigned long)(info->dir_entries + num) > + (unsigned long)info->dir_entries + info->dir_buf_size) { pr_err("dir contents are larger than expected\n"); WARN_ON(1); goto bad; @@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_nr = num; while (num) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); - info->dir_dname_len[i] = ceph_decode_32(p); - ceph_decode_need(p, end, info->dir_dname_len[i], bad); - info->dir_dname[i] = *p; - *p += info->dir_dname_len[i]; - dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], - info->dir_dname[i]); - info->dir_dlease[i] = *p; + rde->name_len = ceph_decode_32(p); + ceph_decode_need(p, end, rde->name_len, bad); + rde->name = *p; + *p += rde->name_len; + dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + rde->lease = *p; *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i], features); + err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; + /* ceph_readdir_prepopulate() will update it */ + rde->offset = 0; i++; num--; } @@ -345,9 +348,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - if (!info->dir_in) + if (!info->dir_entries) return; - free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); + free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -386,9 +389,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s) atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) - ceph_auth_destroy_authorizer( - s->s_mdsc->fsc->client->monc.auth, - s->s_auth.authorizer); + ceph_auth_destroy_authorizer(s->s_auth.authorizer); kfree(s); } } @@ -569,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req); } +DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) + /* * lookup session, bump ref if found. * * called under mdsc->mutex. */ -static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, - u64 tid) +static struct ceph_mds_request * +lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; - struct rb_node *n = mdsc->request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mds_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else { - ceph_mdsc_get_request(req); - return req; - } - } - return NULL; -} -static void __insert_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *new) -{ - struct rb_node **p = &mdsc->request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mds_request *req = NULL; + req = lookup_request(&mdsc->request_tree, tid); + if (req) + ceph_mdsc_get_request(req); - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mds_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &mdsc->request_tree); + return req; } /* @@ -632,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); - __insert_request(mdsc, req); + insert_request(&mdsc->request_tree, req); req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); @@ -665,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } } - rb_erase(&req->r_node, &mdsc->request_tree); - RB_CLEAR_NODE(&req->r_node); + erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir && req->r_got_unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); @@ -870,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 int metadata_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; + struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; void *p; const char* metadata[][2] = { {"hostname", utsname()->nodename}, {"kernel_version", utsname()->release}, - {"entity_id", opt->name ? opt->name : ""}, + {"entity_id", opt->name ? : ""}, + {"root", fsopt->server_path ? : "/"}, {NULL, NULL} }; @@ -1151,9 +1125,11 @@ out: static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { + struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - int drop = 0; + bool drop = false; + bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); @@ -1161,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; + + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; + + if (ci->i_wrbuffer_ref > 0 && + ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + invalidate = true; while (true) { struct rb_node *n = rb_first(&ci->i_cap_flush_tree); @@ -1185,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = 1; + drop = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1195,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = 1; + drop = true; } spin_unlock(&mdsc->cap_dirty_lock); @@ -1212,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_del(&cf->list); ceph_free_cap_flush(cf); } - while (drop--) + + wake_up_all(&ci->i_cap_wq); + if (invalidate) + ceph_queue_invalidate(inode); + if (drop) iput(inode); return 0; } @@ -1222,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, */ static void remove_session_caps(struct ceph_mds_session *session) { + struct ceph_fs_client *fsc = session->s_mdsc->fsc; + struct super_block *sb = fsc->sb; dout("remove_session_caps on %p\n", session); - iterate_session_caps(session, remove_session_caps_cb, NULL); + iterate_session_caps(session, remove_session_caps_cb, fsc); spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { - struct super_block *sb = session->s_mdsc->fsc->sb; struct inode *inode; struct ceph_cap *cap, *prev = NULL; struct ceph_vino vino; @@ -1272,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } + wake_up_all(&ci->i_cap_wq); return 0; } @@ -1610,7 +1596,7 @@ again: while (!list_empty(&tmp_list)) { if (!msg) { msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, - PAGE_CACHE_SIZE, GFP_NOFS, false); + PAGE_SIZE, GFP_NOFS, false); if (!msg) goto out_err; head = msg->front.iov_base; @@ -1673,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; - size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + - sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + size_t size = sizeof(struct ceph_mds_reply_dir_entry); int order, num_entries; spin_lock(&ci->i_ceph_lock); @@ -1685,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, - order); - if (rinfo->dir_in) + rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, + order); + if (rinfo->dir_entries) break; order--; } - if (!rinfo->dir_in) + if (!rinfo->dir_entries) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; @@ -1724,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; kref_init(&req->r_kref); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); @@ -2416,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("handle_reply on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); @@ -2606,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); goto out; /* dup reply? */ @@ -3900,7 +3886,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &s->s_auth; if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(auth->authorizer); auth->authorizer = NULL; } if (!auth->authorizer) { diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 37712ccffcc6..e7d38aac7109 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in { u32 pool_ns_len; }; +struct ceph_mds_reply_dir_entry { + char *name; + u32 name_len; + struct ceph_mds_reply_lease *lease; + struct ceph_mds_reply_info_in inode; + loff_t offset; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; - u8 dir_complete, dir_end; + bool dir_complete; + bool dir_end; + bool hash_order; + struct ceph_mds_reply_dir_entry *dir_entries; }; /* for create results */ @@ -97,7 +104,7 @@ struct ceph_mds_reply_info_parsed { /* * cap releases are batched and sent to the MDS en masse. */ -#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \ +#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \ sizeof(struct ceph_mds_cap_release)) / \ sizeof(struct ceph_mds_cap_item)) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 261531e55e9d..8c3591a7fbae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) const void *start = *p; int i, j, n; int err = -EINVAL; - u16 version; + u8 mdsmap_v, mdsmap_cv; m = kzalloc(sizeof(*m), GFP_NOFS); if (m == NULL) return ERR_PTR(-ENOMEM); - ceph_decode_16_safe(p, end, version, bad); - if (version > 3) { - pr_warn("got mdsmap version %d > 3, failing", version); - goto bad; + ceph_decode_need(p, end, 1 + 1, bad); + mdsmap_v = ceph_decode_8(p); + mdsmap_cv = ceph_decode_8(p); + if (mdsmap_v >= 4) { + u32 mdsmap_len; + ceph_decode_32_safe(p, end, mdsmap_len, bad); + if (end < *p + mdsmap_len) + goto bad; + end = *p + mdsmap_len; } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); @@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 namelen; s32 mds, inc, state; u64 state_seq; - u8 infoversion; + u8 info_v; + void *info_end = NULL; struct ceph_entity_addr addr; u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; - ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); - infoversion = ceph_decode_8(p); + info_v= ceph_decode_8(p); + if (info_v >= 4) { + u32 info_len; + u8 info_cv; + ceph_decode_need(p, end, 1 + sizeof(u32), bad); + info_cv = ceph_decode_8(p); + info_len = ceph_decode_32(p); + info_end = *p + info_len; + if (info_end > end) + goto bad; + } + + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); *p += sizeof(u64); namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; @@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; - if (infoversion >= 2) { + if (info_v >= 2) { ceph_decode_32_safe(p, end, num_export_targets, bad); pexport_targets = *p; *p += num_export_targets * sizeof(u32); @@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) num_export_targets = 0; } + if (info_end && *p != info_end) { + if (*p > info_end) + goto bad; + *p = info_end; + } + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), @@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ + *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c973043deb0e..91e02481ce06 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) * mount options */ enum { + Opt_mds_namespace, Opt_wsize, Opt_rsize, Opt_rasize, @@ -143,6 +144,7 @@ enum { }; static match_table_t fsopt_tokens = { + {Opt_mds_namespace, "mds_namespace=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_rasize, "rasize=%d"}, @@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ + case Opt_mds_namespace: + fsopt->mds_namespace = intval; + break; case Opt_wsize: fsopt->wsize = intval; break; @@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); kfree(args->snapdir_name); + kfree(args->server_path); kfree(args); } @@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } static int parse_mount_options(struct ceph_mount_options **pfsopt, struct ceph_options **popt, int flags, char *options, - const char *dev_name, - const char **path) + const char *dev_name) { struct ceph_mount_options *fsopt; const char *dev_name_end; @@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); + fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE; /* * Distinguish the server list from the path in "dev_name". @@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, */ dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - /* skip over leading '/' for path */ - *path = dev_name_end + 1; + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) { + err = -ENOMEM; + goto out; + } } else { - /* path is empty */ dev_name_end = dev_name + strlen(dev_name); - *path = dev_name_end; } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ @@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, goto out; } dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - dout("server path '%s'\n", *path); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); @@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif + if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE) + seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, { struct ceph_fs_client *fsc; const u64 supported_features = - CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDS_INLINE_DATA; + CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; const u64 required_features = 0; int page_count; size_t size; @@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->monc.fs_cluster_id = fsopt->mds_namespace; ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); fsc->mount_options = fsopt; @@ -560,7 +574,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, /* set up mempools */ err = -ENOMEM; - page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; + page_count = fsc->mount_options->wsize >> PAGE_SHIFT; size = sizeof (struct page *) * (page_count ? page_count : 1); fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); if (!fsc->wb_pagevec_pool) @@ -785,8 +799,7 @@ out: /* * mount: join the ceph cluster, and open root directory. */ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, - const char *path) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto fail; } - if (path[0] == 0) { + if (!fsc->mount_options->server_path) { root = fsc->sb->s_root; dget(root); } else { - dout("mount opening base mountpoint\n"); + const char *path = fsc->mount_options->server_path + 1; + dout("mount opening path %s\n", path); root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -912,13 +926,13 @@ static int ceph_register_bdi(struct super_block *sb, int err; /* set ra_pages based on rasize mount option? */ - if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) + if (fsc->mount_options->rasize >= PAGE_SIZE) fsc->backing_dev_info.ra_pages = - (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) + (fsc->mount_options->rasize + PAGE_SIZE - 1) >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = - VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; + VM_MAX_READAHEAD * 1024 / PAGE_SIZE; err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); @@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - const char *path = NULL; struct ceph_mount_options *fsopt = NULL; struct ceph_options *opt = NULL; @@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, #ifdef CONFIG_CEPH_FS_POSIX_ACL flags |= MS_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { res = ERR_PTR(err); goto out_final; @@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } } - res = ceph_real_mount(fsc, path); + res = ceph_real_mount(fsc); if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e705c4d612d7..0130a8592191 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -62,6 +62,7 @@ struct ceph_mount_options { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ + int mds_namespace; /* * everything above this point can be memcmp'd; everything below @@ -69,6 +70,7 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *server_path; /* default "/" */ }; struct ceph_fs_client { @@ -295,6 +297,7 @@ struct ceph_inode_info { u64 i_files, i_subdirs; struct rb_root i_fragtree; + int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; @@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) return (struct ceph_dentry_info *)dentry->d_fsdata; } -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; -} - /* * caps helpers */ @@ -632,7 +631,6 @@ struct ceph_file_info { struct ceph_mds_request *last_readdir; /* readdir: position within a frag */ - unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ long long dir_release_count; @@ -785,19 +783,15 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) return __ceph_do_getattr(inode, NULL, mask, force); } extern int ceph_permission(struct inode *inode, int mask); +extern int __ceph_setattr(struct inode *inode, struct iattr *attr); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); /* xattr.c */ -extern int ceph_setxattr(struct dentry *, const char *, const void *, - size_t, int); -int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); -int __ceph_removexattr(struct dentry *, const char *); -extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); -extern int ceph_removexattr(struct dentry *, const char *); extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); extern void __init ceph_xattr_init(void); @@ -931,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; +extern int ceph_renew_caps(struct inode *inode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, @@ -946,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9410abdef3ce..dacc1bd85629 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -16,6 +16,8 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr); +const struct xattr_handler ceph_other_xattr_handler; + /* * List of handlers for synthetic system.* attributes. Other * attributes are handled directly. @@ -25,6 +27,7 @@ const struct xattr_handler *ceph_xattr_handlers[] = { &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, #endif + &ceph_other_xattr_handler, NULL, }; @@ -33,7 +36,6 @@ static bool ceph_is_valid_xattr(const char *name) return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || - !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -75,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, char buf[128]; dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { size_t len = strlen(pool_name); @@ -107,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, ret = -ERANGE; } } - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -141,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, s64 pool = ceph_file_layout_pg_pool(ci->i_layout); const char *pool_name; - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) ret = snprintf(val, size, "%s", pool_name); else ret = snprintf(val, size, "%lld", (unsigned long long)pool); - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -496,19 +498,6 @@ static int __remove_xattr(struct ceph_inode_info *ci, return 0; } -static int __remove_xattr_by_name(struct ceph_inode_info *ci, - const char *name) -{ - struct rb_node **p; - struct ceph_inode_xattr *xattr; - int err; - - p = &ci->i_xattrs.index.rb_node; - xattr = __get_xattr(ci, name); - err = __remove_xattr(ci, xattr); - return err; -} - static char *__copy_xattr_names(struct ceph_inode_info *ci, char *dest) { @@ -740,9 +729,6 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, int req_mask; int err; - if (!ceph_is_valid_xattr(name)) - return -ENODATA; - /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr) { @@ -804,15 +790,6 @@ out: return err; } -ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, - size_t size) -{ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, value, size); - - return __ceph_getxattr(d_inode(dentry), name, value, size); -} - ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = d_inode(dentry); @@ -877,15 +854,15 @@ out: return err; } -static int ceph_sync_setxattr(struct dentry *dentry, const char *name, +static int ceph_sync_setxattr(struct inode *inode, const char *name, const char *value, size_t size, int flags) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *inode = d_inode(dentry); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_pagelist *pagelist = NULL; + int op = CEPH_MDS_OP_SETXATTR; int err; if (size > 0) { @@ -899,20 +876,21 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, if (err) goto out; } else if (!value) { - flags |= CEPH_XATTR_REMOVE; + if (flags & CEPH_XATTR_REPLACE) + op = CEPH_MDS_OP_RMXATTR; + else + flags |= CEPH_XATTR_REMOVE; } dout("setxattr value=%.*s\n", (int)size, value); /* do request */ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, - USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } - req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { ceph_mdsc_put_request(req); @@ -920,8 +898,11 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, goto out; } - req->r_pagelist = pagelist; - pagelist = NULL; + if (op == CEPH_MDS_OP_SETXATTR) { + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_pagelist = pagelist; + pagelist = NULL; + } req->r_inode = inode; ihold(inode); @@ -939,13 +920,12 @@ out: return err; } -int __ceph_setxattr(struct dentry *dentry, const char *name, +int __ceph_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf = NULL; int issued; int err; @@ -958,8 +938,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, int required_blob_size; bool lock_snap_rwsem = false; - if (!ceph_is_valid_xattr(name)) - return -EOPNOTSUPP; + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; vxattr = ceph_match_vxattr(inode, name); if (vxattr && vxattr->readonly) @@ -1056,7 +1036,7 @@ do_sync_unlocked: "during filling trace\n", inode); err = -EBUSY; } else { - err = ceph_sync_setxattr(dentry, name, value, size, flags); + err = ceph_sync_setxattr(inode, name, value, size, flags); } out: ceph_free_cap_flush(prealloc_cf); @@ -1066,146 +1046,29 @@ out: return err; } -int ceph_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int ceph_get_xattr_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) { - if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) - return -EROFS; - - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return __ceph_setxattr(dentry, name, value, size, flags); -} - -static int ceph_send_removexattr(struct dentry *dentry, const char *name) -{ - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = d_inode(dentry); - struct ceph_mds_request *req; - int err; - - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, - USE_AUTH_MDS); - if (IS_ERR(req)) - return PTR_ERR(req); - req->r_path2 = kstrdup(name, GFP_NOFS); - if (!req->r_path2) - return -ENOMEM; - - req->r_inode = inode; - ihold(inode); - req->r_num_caps = 1; - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; - err = ceph_mdsc_do_request(mdsc, NULL, req); - ceph_mdsc_put_request(req); - return err; + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __ceph_getxattr(inode, name, value, size); } -int __ceph_removexattr(struct dentry *dentry, const char *name) +static int ceph_set_xattr_handler(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - struct inode *inode = d_inode(dentry); - struct ceph_vxattr *vxattr; - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; - struct ceph_cap_flush *prealloc_cf = NULL; - int issued; - int err; - int required_blob_size; - int dirty; - bool lock_snap_rwsem = false; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - - vxattr = ceph_match_vxattr(inode, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - - /* pass any unhandled ceph.* xattrs through to the MDS */ - if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) - goto do_sync_unlocked; - - prealloc_cf = ceph_alloc_cap_flush(); - if (!prealloc_cf) - return -ENOMEM; - - err = -ENOMEM; - spin_lock(&ci->i_ceph_lock); -retry: - issued = __ceph_caps_issued(ci, NULL); - if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) - goto do_sync; - - if (!lock_snap_rwsem && !ci->i_head_snapc) { - lock_snap_rwsem = true; - if (!down_read_trylock(&mdsc->snap_rwsem)) { - spin_unlock(&ci->i_ceph_lock); - down_read(&mdsc->snap_rwsem); - spin_lock(&ci->i_ceph_lock); - goto retry; - } - } - - dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); - - __build_xattrs(inode); - - required_blob_size = __get_required_blob_size(ci, 0, 0); - - if (!ci->i_xattrs.prealloc_blob || - required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { - struct ceph_buffer *blob; - - spin_unlock(&ci->i_ceph_lock); - dout(" preaallocating new blob size=%d\n", required_blob_size); - blob = ceph_buffer_new(required_blob_size, GFP_NOFS); - if (!blob) - goto do_sync_unlocked; - spin_lock(&ci->i_ceph_lock); - if (ci->i_xattrs.prealloc_blob) - ceph_buffer_put(ci->i_xattrs.prealloc_blob); - ci->i_xattrs.prealloc_blob = blob; - goto retry; - } - - err = __remove_xattr_by_name(ceph_inode(inode), name); - - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, - &prealloc_cf); - ci->i_xattrs.dirty = true; - inode->i_ctime = current_fs_time(inode->i_sb); - spin_unlock(&ci->i_ceph_lock); - if (lock_snap_rwsem) - up_read(&mdsc->snap_rwsem); - if (dirty) - __mark_inode_dirty(inode, dirty); - ceph_free_cap_flush(prealloc_cf); - return err; -do_sync: - spin_unlock(&ci->i_ceph_lock); -do_sync_unlocked: - if (lock_snap_rwsem) - up_read(&mdsc->snap_rwsem); - ceph_free_cap_flush(prealloc_cf); - err = ceph_send_removexattr(dentry, name); - return err; + return __ceph_setxattr(d_inode(dentry), name, value, size, flags); } -int ceph_removexattr(struct dentry *dentry, const char *name) -{ - if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) - return -EROFS; - - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); - - return __ceph_removexattr(dentry, name); -} +const struct xattr_handler ceph_other_xattr_handler = { + .prefix = "", /* match any name => handlers called with full name */ + .get = ceph_get_xattr_handler, + .set = ceph_set_xattr_handler, +}; #ifdef CONFIG_SECURITY bool ceph_security_xattr_wanted(struct inode *in) diff --git a/fs/char_dev.c b/fs/char_dev.c index 24b142569ca9..687471dc04a0 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -91,6 +91,10 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, break; } + if (i < CHRDEV_MAJOR_DYN_END) + pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range", + name, i); + if (i == 0) { ret = -EBUSY; goto out; diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 1964d212ab08..eed7eb09f46f 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -5,9 +5,10 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ - cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ + cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o +cifs-$(CONFIG_CIFS_XATTR) += xattr.o cifs-$(CONFIG_CIFS_ACL) += cifsacl.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index e956cba94338..ec9dbbcca3b9 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -151,8 +151,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata, if (sb_mountdata == NULL) return ERR_PTR(-EINVAL); - if (strlen(fullpath) - ref->path_consumed) + if (strlen(fullpath) - ref->path_consumed) { prepath = fullpath + ref->path_consumed; + /* skip initial delimiter */ + if (*prepath == '/' || *prepath == '\\') + prepath++; + } *devname = cifs_build_devname(ref->node_name, prepath); if (IS_ERR(*devname)) { @@ -302,7 +306,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) if (full_path == NULL) goto cdda_exit; - cifs_sb = CIFS_SB(d_inode(mntpt)->i_sb); + cifs_sb = CIFS_SB(mntpt->d_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { mnt = ERR_CAST(tlink); diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 6908080e9b6d..b611fc2e8984 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -24,10 +24,13 @@ #include <linux/string.h> #include <keys/user-type.h> #include <linux/key-type.h> +#include <linux/keyctl.h> #include <linux/inet.h> #include "cifsglob.h" #include "cifs_spnego.h" #include "cifs_debug.h" +#include "cifsproto.h" +static const struct cred *spnego_cred; /* create a new cifs key */ static int @@ -102,6 +105,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) size_t desc_len; struct key *spnego_key; const char *hostname = server->hostname; + const struct cred *saved_cred; /* length of fields (with semicolons): ver=0xyz ip4=ipaddress host=hostname sec=mechanism uid=0xFF user=username */ @@ -163,7 +167,9 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) sprintf(dp, ";pid=0x%x", current->pid); cifs_dbg(FYI, "key description = %s\n", description); + saved_cred = override_creds(spnego_cred); spnego_key = request_key(&cifs_spnego_key_type, description, ""); + revert_creds(saved_cred); #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { @@ -177,3 +183,64 @@ out: kfree(description); return spnego_key; } + +int +init_cifs_spnego(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + cifs_dbg(FYI, "Registering the %s key type\n", + cifs_spnego_key_type.name); + + /* + * Create an override credential set with special thread keyring for + * spnego upcalls. + */ + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = keyring_alloc(".cifs_spnego", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = register_key_type(&cifs_spnego_key_type); + if (ret < 0) + goto failed_put_key; + + /* + * instruct request_key() to use this special keyring as a cache for + * the results it looks up + */ + set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + spnego_cred = cred; + + cifs_dbg(FYI, "cifs spnego keyring: %d\n", key_serial(keyring)); + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void +exit_cifs_spnego(void) +{ + key_revoke(spnego_cred->thread_keyring); + unregister_key_type(&cifs_spnego_key_type); + put_cred(spnego_cred); + cifs_dbg(FYI, "Unregistered %s key type\n", cifs_spnego_key_type.name); +} diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 3f93125916bf..71e8a56e9479 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -360,7 +360,7 @@ init_cifs_idmap(void) GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 4897dacf8944..6aeb8d4616a4 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -66,45 +66,15 @@ cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server) return 0; } -/* - * Calculate and return the CIFS signature based on the mac key and SMB PDU. - * The 16 byte signature must be allocated by the caller. Note we only use the - * 1st eight bytes and that the smb header signature field on input contains - * the sequence number before this function is called. Also, this function - * should be called with the server->srv_mutex held. - */ -static int cifs_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, char *signature) +int __cifs_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server, char *signature, + struct shash_desc *shash) { int i; int rc; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; - if (iov == NULL || signature == NULL || server == NULL) - return -EINVAL; - - if (!server->secmech.sdescmd5) { - rc = cifs_crypto_shash_md5_allocate(server); - if (rc) { - cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__); - return -1; - } - } - - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); - if (rc) { - cifs_dbg(VFS, "%s: Could not init md5\n", __func__); - return rc; - } - - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, - server->session_key.response, server->session_key.len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with response\n", __func__); - return rc; - } - for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; @@ -117,12 +87,10 @@ static int cifs_calc_signature(struct smb_rqst *rqst, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ - rc = - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(shash, iov[i].iov_base + 4, iov[i].iov_len - 4); } else { - rc = - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(shash, iov[i].iov_base, iov[i].iov_len); } if (rc) { @@ -134,21 +102,64 @@ static int cifs_calc_signature(struct smb_rqst *rqst, /* now hash over the rq_pages array */ for (i = 0; i < rqst->rq_npages; i++) { - struct kvec p_iov; + void *kaddr = kmap(rqst->rq_pages[i]); + size_t len = rqst->rq_pagesz; + + if (i == rqst->rq_npages - 1) + len = rqst->rq_tailsz; + + crypto_shash_update(shash, kaddr, len); - cifs_rqst_page_to_kvec(rqst, i, &p_iov); - crypto_shash_update(&server->secmech.sdescmd5->shash, - p_iov.iov_base, p_iov.iov_len); kunmap(rqst->rq_pages[i]); } - rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + rc = crypto_shash_final(shash, signature); if (rc) - cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); + cifs_dbg(VFS, "%s: Could not generate hash\n", __func__); return rc; } +/* + * Calculate and return the CIFS signature based on the mac key and SMB PDU. + * The 16 byte signature must be allocated by the caller. Note we only use the + * 1st eight bytes and that the smb header signature field on input contains + * the sequence number before this function is called. Also, this function + * should be called with the server->srv_mutex held. + */ +static int cifs_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server, char *signature) +{ + int rc; + + if (!rqst->rq_iov || !signature || !server) + return -EINVAL; + + if (!server->secmech.sdescmd5) { + rc = cifs_crypto_shash_md5_allocate(server); + if (rc) { + cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__); + return -1; + } + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init md5\n", __func__); + return rc; + } + + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); + return rc; + } + + return __cifs_calc_signature(rqst, server, signature, + &server->secmech.sdescmd5->shash); +} + /* must be called with server->srv_mutex held */ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1d86fc620e5c..5d8b7edf8a8f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -37,6 +37,7 @@ #include <linux/freezer.h> #include <linux/namei.h> #include <linux/random.h> +#include <linux/xattr.h> #include <net/ipv6.h> #include "cifsfs.h" #include "cifspdu.h" @@ -135,6 +136,7 @@ cifs_read_super(struct super_block *sb) sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; + sb->s_xattr = cifs_xattr_handlers; sb->s_bdi = &cifs_sb->bdi; sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ @@ -888,44 +890,33 @@ const struct inode_operations cifs_dir_inode_ops = { .rmdir = cifs_rmdir, .rename2 = cifs_rename2, .permission = cifs_permission, -/* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, .symlink = cifs_symlink, .mknod = cifs_mknod, -#ifdef CONFIG_CIFS_XATTR - .setxattr = cifs_setxattr, - .getxattr = cifs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = cifs_listxattr, - .removexattr = cifs_removexattr, -#endif + .removexattr = generic_removexattr, }; const struct inode_operations cifs_file_inode_ops = { -/* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, - .getattr = cifs_getattr, /* do we need this anymore? */ + .getattr = cifs_getattr, .permission = cifs_permission, -#ifdef CONFIG_CIFS_XATTR - .setxattr = cifs_setxattr, - .getxattr = cifs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = cifs_listxattr, - .removexattr = cifs_removexattr, -#endif + .removexattr = generic_removexattr, }; const struct inode_operations cifs_symlink_inode_ops = { .readlink = generic_readlink, .get_link = cifs_get_link, .permission = cifs_permission, - /* BB add the following two eventually */ - /* revalidate: cifs_revalidate, - setattr: cifs_notify_change, *//* BB do we need notify change */ -#ifdef CONFIG_CIFS_XATTR - .setxattr = cifs_setxattr, - .getxattr = cifs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = cifs_listxattr, - .removexattr = cifs_removexattr, -#endif + .removexattr = generic_removexattr, }; static int cifs_clone_file_range(struct file *src_file, loff_t off, @@ -962,7 +953,7 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off, cifs_dbg(FYI, "about to flush pages\n"); /* should we flush first and last page first */ truncate_inode_pages_range(&target_inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len)-1); + PAGE_ALIGN(destoff + len)-1); if (target_tcon->ses->server->ops->duplicate_extents) rc = target_tcon->ses->server->ops->duplicate_extents(xid, @@ -1083,7 +1074,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { }; const struct file_operations cifs_dir_ops = { - .iterate = cifs_readdir, + .iterate_shared = cifs_readdir, .release = cifs_closedir, .read = generic_read_dir, .unlocked_ioctl = cifs_ioctl, @@ -1307,7 +1298,7 @@ init_cifs(void) goto out_destroy_mids; #ifdef CONFIG_CIFS_UPCALL - rc = register_key_type(&cifs_spnego_key_type); + rc = init_cifs_spnego(); if (rc) goto out_destroy_request_bufs; #endif /* CONFIG_CIFS_UPCALL */ @@ -1330,7 +1321,7 @@ out_init_cifs_idmap: out_register_key_type: #endif #ifdef CONFIG_CIFS_UPCALL - unregister_key_type(&cifs_spnego_key_type); + exit_cifs_spnego(); out_destroy_request_bufs: #endif cifs_destroy_request_bufs(); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 83aac8ba50b0..9dcf974acc47 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -120,15 +120,19 @@ extern const char *cifs_get_link(struct dentry *, struct inode *, struct delayed_call *); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname); -extern int cifs_removexattr(struct dentry *, const char *); -extern int cifs_setxattr(struct dentry *, const char *, const void *, - size_t, int); -extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); + +#ifdef CONFIG_CIFS_XATTR +extern const struct xattr_handler *cifs_xattr_handlers[]; extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); +#else +# define cifs_xattr_handlers NULL +# define cifs_listxattr NULL +#endif + extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); #ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.08" +#define CIFS_VERSION "2.09" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d21da9f05bae..bba106cdc43c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -615,8 +615,6 @@ struct TCP_Server_Info { bool sec_mskerberos; /* supports legacy MS Kerberos */ bool large_buf; /* is current buffer large? */ struct delayed_work echo; /* echo ping workqueue job */ - struct kvec *iov; /* reusable kvec array for receives */ - unsigned int nr_iov; /* number of kvecs in array */ char *smallbuf; /* pointer to current "small" buffer */ char *bigbuf; /* pointer to current "big" buffer */ unsigned int total_read; /* total amount of data read in this pass */ @@ -714,7 +712,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb) * * Note that this might make for "interesting" allocation problems during * writeback however as we have to allocate an array of pointers for the - * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * pages. A 16M write means ~32kb page array with PAGE_SIZE == 4096. * * For reads, there is a similar problem as we need to allocate an array * of kvecs to handle the receive, though that should only need to be done @@ -733,7 +731,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 - * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill + * pages in a single call. With PAGE_SIZE == 4k, this means we can fill * a single wsize request with a single call. */ #define CIFS_DEFAULT_IOSIZE (1024 * 1024) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index eed7ff50faf0..1243bd326591 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -37,8 +37,6 @@ extern void cifs_buf_release(void *); extern struct smb_hdr *cifs_small_buf_get(void); extern void cifs_small_buf_release(void *); extern void free_rsp_buf(int, void *); -extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, - struct kvec *iov); extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, unsigned int /* length */); extern unsigned int _get_xid(void); @@ -60,6 +58,8 @@ do { \ } while (0) extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); +extern int init_cifs_spnego(void); +extern void exit_cifs_spnego(void); extern char *build_path_from_dentry(struct dentry *); extern char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, @@ -181,10 +181,9 @@ extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, - unsigned int to_read); -extern int cifs_readv_from_socket(struct TCP_Server_Info *server, - struct kvec *iov_orig, unsigned int nr_segs, - unsigned int to_read); + unsigned int to_read); +extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, + struct page *page, unsigned int to_read); extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); @@ -512,4 +511,7 @@ int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_written); +int __cifs_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server, char *signature, + struct shash_desc *shash); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 76fcb50295a3..d47197ea4ab6 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1447,10 +1447,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) len = min_t(unsigned int, buflen, server->vals->read_rsp_size) - HEADER_SIZE(server) + 1; - rdata->iov.iov_base = buf + HEADER_SIZE(server) - 1; - rdata->iov.iov_len = len; - - length = cifs_readv_from_socket(server, &rdata->iov, 1, len); + length = cifs_read_from_socket(server, + buf + HEADER_SIZE(server) - 1, len); if (length < 0) return length; server->total_read += length; @@ -1502,9 +1500,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) len = data_offset - server->total_read; if (len > 0) { /* read any junk before data into the rest of smallbuf */ - rdata->iov.iov_base = buf + server->total_read; - rdata->iov.iov_len = len; - length = cifs_readv_from_socket(server, &rdata->iov, 1, len); + length = cifs_read_from_socket(server, + buf + server->total_read, len); if (length < 0) return length; server->total_read += length; @@ -1929,17 +1926,17 @@ cifs_writev_requeue(struct cifs_writedata *wdata) wsize = server->ops->wp_retry_size(inode); if (wsize < rest_len) { - nr_pages = wsize / PAGE_CACHE_SIZE; + nr_pages = wsize / PAGE_SIZE; if (!nr_pages) { rc = -ENOTSUPP; break; } - cur_len = nr_pages * PAGE_CACHE_SIZE; - tailsz = PAGE_CACHE_SIZE; + cur_len = nr_pages * PAGE_SIZE; + tailsz = PAGE_SIZE; } else { - nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE); + nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE); cur_len = rest_len; - tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE; + tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE; } wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); @@ -1957,7 +1954,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) wdata2->sync_mode = wdata->sync_mode; wdata2->nr_pages = nr_pages; wdata2->offset = page_offset(wdata2->pages[0]); - wdata2->pagesz = PAGE_CACHE_SIZE; + wdata2->pagesz = PAGE_SIZE; wdata2->tailsz = tailsz; wdata2->bytes = cur_len; @@ -1975,7 +1972,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) if (rc != 0 && rc != -EAGAIN) { SetPageError(wdata2->pages[j]); end_page_writeback(wdata2->pages[j]); - page_cache_release(wdata2->pages[j]); + put_page(wdata2->pages[j]); } } @@ -2018,7 +2015,7 @@ cifs_writev_complete(struct work_struct *work) else if (wdata->result < 0) SetPageError(page); end_page_writeback(page); - page_cache_release(page); + put_page(page); } if (wdata->result != -EAGAIN) mapping_set_error(inode->i_mapping, wdata->result); @@ -3366,7 +3363,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) return -EOPNOTSUPP; - if (acl_type & ACL_TYPE_ACCESS) { + if (acl_type == ACL_TYPE_ACCESS) { count = le16_to_cpu(cifs_acl->access_entry_count); pACE = &cifs_acl->ace_array[0]; size = sizeof(struct cifs_posix_acl); @@ -3377,7 +3374,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, size_of_data_area, size); return -EINVAL; } - } else if (acl_type & ACL_TYPE_DEFAULT) { + } else if (acl_type == ACL_TYPE_DEFAULT) { count = le16_to_cpu(cifs_acl->access_entry_count); size = sizeof(struct cifs_posix_acl); size += sizeof(struct cifs_posix_ace) * count; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a763cd3d9e7c..66736f57b5ab 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -501,99 +501,34 @@ server_unresponsive(struct TCP_Server_Info *server) return false; } -/* - * kvec_array_init - clone a kvec array, and advance into it - * @new: pointer to memory for cloned array - * @iov: pointer to original array - * @nr_segs: number of members in original array - * @bytes: number of bytes to advance into the cloned array - * - * This function will copy the array provided in iov to a section of memory - * and advance the specified number of bytes into the new array. It returns - * the number of segments in the new array. "new" must be at least as big as - * the original iov array. - */ -static unsigned int -kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, - size_t bytes) -{ - size_t base = 0; - - while (bytes || !iov->iov_len) { - int copy = min(bytes, iov->iov_len); - - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - nr_segs--; - base = 0; - } - } - memcpy(new, iov, sizeof(*iov) * nr_segs); - new->iov_base += base; - new->iov_len -= base; - return nr_segs; -} - -static struct kvec * -get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) -{ - struct kvec *new_iov; - - if (server->iov && nr_segs <= server->nr_iov) - return server->iov; - - /* not big enough -- allocate a new one and release the old */ - new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS); - if (new_iov) { - kfree(server->iov); - server->iov = new_iov; - server->nr_iov = nr_segs; - } - return new_iov; -} - -int -cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, - unsigned int nr_segs, unsigned int to_read) +static int +cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) { int length = 0; int total_read; - unsigned int segs; - struct msghdr smb_msg; - struct kvec *iov; - iov = get_server_iovec(server, nr_segs); - if (!iov) - return -ENOMEM; - - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; + smb_msg->msg_control = NULL; + smb_msg->msg_controllen = 0; - for (total_read = 0; to_read; total_read += length, to_read -= length) { + for (total_read = 0; msg_data_left(smb_msg); total_read += length) { try_to_freeze(); - if (server_unresponsive(server)) { - total_read = -ECONNABORTED; - break; - } + if (server_unresponsive(server)) + return -ECONNABORTED; - segs = kvec_array_init(iov, iov_orig, nr_segs, total_read); + length = sock_recvmsg(server->ssocket, smb_msg, 0); - length = kernel_recvmsg(server->ssocket, &smb_msg, - iov, segs, to_read, 0); + if (server->tcpStatus == CifsExiting) + return -ESHUTDOWN; - if (server->tcpStatus == CifsExiting) { - total_read = -ESHUTDOWN; - break; - } else if (server->tcpStatus == CifsNeedReconnect) { + if (server->tcpStatus == CifsNeedReconnect) { cifs_reconnect(server); - total_read = -ECONNABORTED; - break; - } else if (length == -ERESTARTSYS || - length == -EAGAIN || - length == -EINTR) { + return -ECONNABORTED; + } + + if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { /* * Minimum sleep to prevent looping, allowing socket * to clear and app threads to set tcpStatus @@ -602,12 +537,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, usleep_range(1000, 2000); length = 0; continue; - } else if (length <= 0) { - cifs_dbg(FYI, "Received no data or error: expecting %d\n" - "got %d", to_read, length); + } + + if (length <= 0) { + cifs_dbg(FYI, "Received no data or error: %d\n", length); cifs_reconnect(server); - total_read = -ECONNABORTED; - break; + return -ECONNABORTED; } } return total_read; @@ -617,12 +552,21 @@ int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read) { - struct kvec iov; + struct msghdr smb_msg; + struct kvec iov = {.iov_base = buf, .iov_len = to_read}; + iov_iter_kvec(&smb_msg.msg_iter, READ | ITER_KVEC, &iov, 1, to_read); - iov.iov_base = buf; - iov.iov_len = to_read; + return cifs_readv_from_socket(server, &smb_msg); +} - return cifs_readv_from_socket(server, &iov, 1, to_read); +int +cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, + unsigned int to_read) +{ + struct msghdr smb_msg; + struct bio_vec bv = {.bv_page = page, .bv_len = to_read}; + iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read); + return cifs_readv_from_socket(server, &smb_msg); } static bool @@ -783,7 +727,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) } kfree(server->hostname); - kfree(server->iov); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -1196,8 +1139,12 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol) convert_delimiter(vol->UNC, '\\'); - /* If pos is NULL, or is a bogus trailing delimiter then no prepath */ - if (!*pos++ || !*pos) + /* skip any delimiter */ + if (*pos == '/' || *pos == '\\') + pos++; + + /* If pos is NULL then no prepath */ + if (!*pos) return 0; vol->prepath = kstrdup(pos, GFP_KERNEL); @@ -2918,7 +2865,7 @@ static inline void cifs_reclassify_socket4(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); + BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS", &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]); } @@ -2927,7 +2874,7 @@ static inline void cifs_reclassify_socket6(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); + BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS", &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]); } @@ -3630,7 +3577,7 @@ try_mount_again: cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info); /* tune readahead according to rsize */ - cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; + cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE; remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ff882aeaccc6..9793ae0bcaa2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -271,7 +271,7 @@ struct cifsFileInfo * cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) { - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifsFileInfo *cfile; @@ -461,7 +461,7 @@ int cifs_open(struct inode *inode, struct file *file) tcon = tlink_tcon(tlink); server = tcon->ses->server; - full_path = build_path_from_dentry(file->f_path.dentry); + full_path = build_path_from_dentry(file_dentry(file)); if (full_path == NULL) { rc = -ENOMEM; goto out; @@ -1833,7 +1833,7 @@ refind_writable: static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) { struct address_space *mapping = page->mapping; - loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + loff_t offset = (loff_t)page->index << PAGE_SHIFT; char *write_data; int rc = -EFAULT; int bytes_written = 0; @@ -1849,7 +1849,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) write_data = kmap(page); write_data += from; - if ((to > PAGE_CACHE_SIZE) || (from > to)) { + if ((to > PAGE_SIZE) || (from > to)) { kunmap(page); return -EIO; } @@ -1902,7 +1902,7 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, * find_get_pages_tag seems to return a max of 256 on each * iteration, so we must call it several times in order to * fill the array or the wsize is effectively limited to - * 256 * PAGE_CACHE_SIZE. + * 256 * PAGE_SIZE. */ *found_pages = 0; pages = wdata->pages; @@ -1991,7 +1991,7 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, /* put any pages we aren't going to use */ for (i = nr_pages; i < found_pages; i++) { - page_cache_release(wdata->pages[i]); + put_page(wdata->pages[i]); wdata->pages[i] = NULL; } @@ -2009,11 +2009,11 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, wdata->sync_mode = wbc->sync_mode; wdata->nr_pages = nr_pages; wdata->offset = page_offset(wdata->pages[0]); - wdata->pagesz = PAGE_CACHE_SIZE; + wdata->pagesz = PAGE_SIZE; wdata->tailsz = min(i_size_read(mapping->host) - page_offset(wdata->pages[nr_pages - 1]), - (loff_t)PAGE_CACHE_SIZE); - wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; + (loff_t)PAGE_SIZE); + wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz; if (wdata->cfile != NULL) cifsFileInfo_put(wdata->cfile); @@ -2047,15 +2047,15 @@ static int cifs_writepages(struct address_space *mapping, * If wsize is smaller than the page cache size, default to writing * one page at a time via cifs_writepage */ - if (cifs_sb->wsize < PAGE_CACHE_SIZE) + if (cifs_sb->wsize < PAGE_SIZE) return generic_writepages(mapping, wbc); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = true; scanned = true; @@ -2071,7 +2071,7 @@ retry: if (rc) break; - tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1; + tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1; wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, &found_pages); @@ -2111,7 +2111,7 @@ retry: else SetPageError(wdata->pages[i]); end_page_writeback(wdata->pages[i]); - page_cache_release(wdata->pages[i]); + put_page(wdata->pages[i]); } if (rc != -EAGAIN) mapping_set_error(mapping, rc); @@ -2154,7 +2154,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc) xid = get_xid(); /* BB add check for wbc flags */ - page_cache_get(page); + get_page(page); if (!PageUptodate(page)) cifs_dbg(FYI, "ppw - page not up to date\n"); @@ -2170,7 +2170,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc) */ set_page_writeback(page); retry_write: - rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE); + rc = cifs_partialpagewrite(page, 0, PAGE_SIZE); if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL) goto retry_write; else if (rc == -EAGAIN) @@ -2180,7 +2180,7 @@ retry_write: else SetPageUptodate(page); end_page_writeback(page); - page_cache_release(page); + put_page(page); free_xid(xid); return rc; } @@ -2214,12 +2214,12 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, if (copied == len) SetPageUptodate(page); ClearPageChecked(page); - } else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) + } else if (!PageUptodate(page) && copied == PAGE_SIZE) SetPageUptodate(page); if (!PageUptodate(page)) { char *page_data; - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned offset = pos & (PAGE_SIZE - 1); unsigned int xid; xid = get_xid(); @@ -2248,7 +2248,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, } unlock_page(page); - page_cache_release(page); + put_page(page); return rc; } @@ -2687,11 +2687,8 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) out: inode_unlock(inode); - if (rc > 0) { - ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); - if (err < 0) - rc = err; - } + if (rc > 0) + rc = generic_write_sync(iocb, rc); up_read(&cinode->lock_sem); return rc; } @@ -2855,39 +2852,31 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, int result = 0; unsigned int i; unsigned int nr_pages = rdata->nr_pages; - struct kvec iov; rdata->got_bytes = 0; rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; + size_t n; - if (len >= PAGE_SIZE) { - /* enough data to fill the page */ - iov.iov_base = kmap(page); - iov.iov_len = PAGE_SIZE; - cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n", - i, iov.iov_base, iov.iov_len); - len -= PAGE_SIZE; - } else if (len > 0) { - /* enough for partial page, fill and zero the rest */ - iov.iov_base = kmap(page); - iov.iov_len = len; - cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n", - i, iov.iov_base, iov.iov_len); - memset(iov.iov_base + len, '\0', PAGE_SIZE - len); - rdata->tailsz = len; - len = 0; - } else { + if (len <= 0) { /* no need to hold page hostage */ rdata->pages[i] = NULL; rdata->nr_pages--; put_page(page); continue; } - - result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len); - kunmap(page); + n = len; + if (len >= PAGE_SIZE) { + /* enough data to fill the page */ + n = PAGE_SIZE; + len -= n; + } else { + zero_user(page, len, PAGE_SIZE - len); + rdata->tailsz = len; + len = 0; + } + result = cifs_read_page_from_socket(server, page, n); if (result < 0) break; @@ -3286,9 +3275,9 @@ cifs_readv_complete(struct work_struct *work) (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page); - got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes); + got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); - page_cache_release(page); + put_page(page); rdata->pages[i] = NULL; } kref_put(&rdata->refcount, cifs_readdata_release); @@ -3303,34 +3292,24 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, u64 eof; pgoff_t eof_index; unsigned int nr_pages = rdata->nr_pages; - struct kvec iov; /* determine the eof that the server (probably) has */ eof = CIFS_I(rdata->mapping->host)->server_eof; - eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; + eof_index = eof ? (eof - 1) >> PAGE_SHIFT : 0; cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); rdata->got_bytes = 0; - rdata->tailsz = PAGE_CACHE_SIZE; + rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; + size_t n = PAGE_SIZE; - if (len >= PAGE_CACHE_SIZE) { - /* enough data to fill the page */ - iov.iov_base = kmap(page); - iov.iov_len = PAGE_CACHE_SIZE; - cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n", - i, page->index, iov.iov_base, iov.iov_len); - len -= PAGE_CACHE_SIZE; + if (len >= PAGE_SIZE) { + len -= PAGE_SIZE; } else if (len > 0) { /* enough for partial page, fill and zero the rest */ - iov.iov_base = kmap(page); - iov.iov_len = len; - cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n", - i, page->index, iov.iov_base, iov.iov_len); - memset(iov.iov_base + len, - '\0', PAGE_CACHE_SIZE - len); - rdata->tailsz = len; + zero_user(page, len, PAGE_SIZE - len); + n = rdata->tailsz = len; len = 0; } else if (page->index > eof_index) { /* @@ -3341,12 +3320,12 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, * to prevent the VFS from repeatedly attempting to * fill them until the writes are flushed. */ - zero_user(page, 0, PAGE_CACHE_SIZE); + zero_user(page, 0, PAGE_SIZE); lru_cache_add_file(page); flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); - page_cache_release(page); + put_page(page); rdata->pages[i] = NULL; rdata->nr_pages--; continue; @@ -3354,14 +3333,13 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, /* no need to hold page hostage */ lru_cache_add_file(page); unlock_page(page); - page_cache_release(page); + put_page(page); rdata->pages[i] = NULL; rdata->nr_pages--; continue; } - result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len); - kunmap(page); + result = cifs_read_page_from_socket(server, page, n); if (result < 0) break; @@ -3402,8 +3380,8 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, } /* move first page to the tmplist */ - *offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - *bytes = PAGE_CACHE_SIZE; + *offset = (loff_t)page->index << PAGE_SHIFT; + *bytes = PAGE_SIZE; *nr_pages = 1; list_move_tail(&page->lru, tmplist); @@ -3415,7 +3393,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, break; /* would this page push the read over the rsize? */ - if (*bytes + PAGE_CACHE_SIZE > rsize) + if (*bytes + PAGE_SIZE > rsize) break; __SetPageLocked(page); @@ -3424,7 +3402,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, break; } list_move_tail(&page->lru, tmplist); - (*bytes) += PAGE_CACHE_SIZE; + (*bytes) += PAGE_SIZE; expected_index++; (*nr_pages)++; } @@ -3493,7 +3471,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * reach this point however since we set ra_pages to 0 when the * rsize is smaller than a cache page. */ - if (unlikely(rsize < PAGE_CACHE_SIZE)) { + if (unlikely(rsize < PAGE_SIZE)) { add_credits_and_wake_if(server, credits, 0); return 0; } @@ -3512,7 +3490,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, list_del(&page->lru); lru_cache_add_file(page); unlock_page(page); - page_cache_release(page); + put_page(page); } rc = -ENOMEM; add_credits_and_wake_if(server, credits, 0); @@ -3524,7 +3502,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata->offset = offset; rdata->bytes = bytes; rdata->pid = pid; - rdata->pagesz = PAGE_CACHE_SIZE; + rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; rdata->credits = credits; @@ -3542,7 +3520,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, page = rdata->pages[i]; lru_cache_add_file(page); unlock_page(page); - page_cache_release(page); + put_page(page); } /* Fallback to the readpage in error/reconnect cases */ kref_put(&rdata->refcount, cifs_readdata_release); @@ -3577,7 +3555,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, read_data = kmap(page); /* for reads over a certain size could initiate async read ahead */ - rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset); + rc = cifs_read(file, read_data, PAGE_SIZE, poffset); if (rc < 0) goto io_error; @@ -3587,8 +3565,8 @@ static int cifs_readpage_worker(struct file *file, struct page *page, file_inode(file)->i_atime = current_fs_time(file_inode(file)->i_sb); - if (PAGE_CACHE_SIZE > rc) - memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc); + if (PAGE_SIZE > rc) + memset(read_data + rc, 0, PAGE_SIZE - rc); flush_dcache_page(page); SetPageUptodate(page); @@ -3608,7 +3586,7 @@ read_complete: static int cifs_readpage(struct file *file, struct page *page) { - loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + loff_t offset = (loff_t)page->index << PAGE_SHIFT; int rc = -EACCES; unsigned int xid; @@ -3679,8 +3657,8 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int oncethru = 0; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - loff_t offset = pos & (PAGE_CACHE_SIZE - 1); + pgoff_t index = pos >> PAGE_SHIFT; + loff_t offset = pos & (PAGE_SIZE - 1); loff_t page_start = pos & PAGE_MASK; loff_t i_size; struct page *page; @@ -3703,7 +3681,7 @@ start: * the server. If the write is short, we'll end up doing a sync write * instead. */ - if (len == PAGE_CACHE_SIZE) + if (len == PAGE_SIZE) goto out; /* @@ -3718,7 +3696,7 @@ start: (offset == 0 && (pos + len) >= i_size)) { zero_user_segments(page, 0, offset, offset + len, - PAGE_CACHE_SIZE); + PAGE_SIZE); /* * PageChecked means that the parts of the page * to which we're not writing are considered up @@ -3737,7 +3715,7 @@ start: * do a sync write instead since PG_uptodate isn't set. */ cifs_readpage_worker(file, page, &page_start); - page_cache_release(page); + put_page(page); oncethru = 1; goto start; } else { @@ -3764,7 +3742,7 @@ static void cifs_invalidate_page(struct page *page, unsigned int offset, { struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); - if (offset == 0 && length == PAGE_CACHE_SIZE) + if (offset == 0 && length == PAGE_SIZE) cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); } @@ -3772,7 +3750,7 @@ static int cifs_launder_page(struct page *page) { int rc = 0; loff_t range_start = page_offset(page); - loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, @@ -3854,7 +3832,7 @@ void cifs_oplock_break(struct work_struct *work) * Direct IO is not yet supported in the cached mode. */ static ssize_t -cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter) { /* * FIXME diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index aeb26dbfa1bf..514dadb0575d 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -59,7 +59,7 @@ static void cifs_set_ops(struct inode *inode) /* check if server can support readpages */ if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf < - PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) + PAGE_SIZE + MAX_CIFS_HDR_SIZE) inode->i_data.a_ops = &cifs_addr_ops_smallbuf; else inode->i_data.a_ops = &cifs_addr_ops; @@ -2019,8 +2019,8 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, static int cifs_truncate_page(struct address_space *mapping, loff_t from) { - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; + unsigned offset = from & (PAGE_SIZE - 1); struct page *page; int rc = 0; @@ -2028,9 +2028,9 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) if (!page) return -ENOMEM; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); unlock_page(page); - page_cache_release(page); + put_page(page); return rc; } @@ -2418,8 +2418,7 @@ cifs_setattr_exit: int cifs_setattr(struct dentry *direntry, struct iattr *attrs) { - struct inode *inode = d_inode(direntry); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); if (pTcon->unix_ext) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b30a4a6d98a0..65cf85dcda09 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -78,20 +78,34 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, { struct dentry *dentry, *alias; struct inode *inode; - struct super_block *sb = d_inode(parent)->i_sb; + struct super_block *sb = parent->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); dentry = d_hash_and_lookup(parent, name); + if (!dentry) { + /* + * If we know that the inode will need to be revalidated + * immediately, then don't create a new dentry for it. + * We'll end up doing an on the wire call either way and + * this spares us an invalidation. + */ + if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) + return; +retry: + dentry = d_alloc_parallel(parent, name, &wq); + } if (IS_ERR(dentry)) return; - - if (dentry) { + if (!d_in_lookup(dentry)) { inode = d_inode(dentry); if (inode) { - if (d_mountpoint(dentry)) - goto out; + if (d_mountpoint(dentry)) { + dput(dentry); + return; + } /* * If we're generating inode numbers, then we don't * want to clobber the existing one with the one that @@ -106,33 +120,22 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, (inode->i_mode & S_IFMT) == (fattr->cf_mode & S_IFMT)) { cifs_fattr_to_inode(inode, fattr); - goto out; + dput(dentry); + return; } } d_invalidate(dentry); dput(dentry); + goto retry; + } else { + inode = cifs_iget(sb, fattr); + if (!inode) + inode = ERR_PTR(-ENOMEM); + alias = d_splice_alias(inode, dentry); + d_lookup_done(dentry); + if (alias && !IS_ERR(alias)) + dput(alias); } - - /* - * If we know that the inode will need to be revalidated immediately, - * then don't create a new dentry for it. We'll end up doing an on - * the wire call either way and this spares us an invalidation. - */ - if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) - return; - - dentry = d_alloc(parent, name); - if (!dentry) - return; - - inode = cifs_iget(sb, fattr); - if (!inode) - goto out; - - alias = d_splice_alias(inode, dentry); - if (alias && !IS_ERR(alias)) - dput(alias); -out: dput(dentry); } @@ -300,7 +303,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file) cifsFile->invalidHandle = true; cifsFile->srch_inf.endOfSearch = false; - full_path = build_path_from_dentry(file->f_path.dentry); + full_path = build_path_from_dentry(file_dentry(file)); if (full_path == NULL) { rc = -ENOMEM; goto error_exit; @@ -759,7 +762,7 @@ static int cifs_filldir(char *find_entry, struct file *file, */ fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; - cifs_prime_dcache(file->f_path.dentry, &name, &fattr); + cifs_prime_dcache(file_dentry(file), &name, &fattr); ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 59727e32ed0f..af0ec2d5ad0e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -400,19 +400,27 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->LmChallengeResponse.MaximumLength = 0; sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); - rc = setup_ntlmv2_rsp(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); - goto setup_ntlmv2_ret; + if (ses->user_name != NULL) { + rc = setup_ntlmv2_rsp(ses, nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); + goto setup_ntlmv2_ret; + } + memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + sec_blob->NtChallengeResponse.Length = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + sec_blob->NtChallengeResponse.MaximumLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + } else { + /* + * don't send an NT Response for anonymous access + */ + sec_blob->NtChallengeResponse.Length = 0; + sec_blob->NtChallengeResponse.MaximumLength = 0; } - memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - ses->auth_key.len - CIFS_SESS_KEY_SIZE); - tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - - sec_blob->NtChallengeResponse.Length = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); - sec_blob->NtChallengeResponse.MaximumLength = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); if (ses->domainName == NULL) { sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); @@ -670,20 +678,24 @@ sess_auth_lanman(struct sess_data *sess_data) pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; - /* no capabilities flags in old lanman negotiation */ - pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - - /* Calculate hash with password and copy into bcc_ptr. - * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negottiate Protocol response states - * to use challenge/response method (i.e. Password bit is 1). - */ - rc = calc_lanman_hash(ses->password, ses->server->cryptkey, - ses->server->sec_mode & SECMODE_PW_ENCRYPT ? - true : false, lnm_session_key); - - memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; + if (ses->user_name != NULL) { + /* no capabilities flags in old lanman negotiation */ + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key); + + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + } else { + pSMB->old_req.PasswordLength = 0; + } /* * can not sign if LANMAN negotiated so no need @@ -769,26 +781,31 @@ sess_auth_ntlm(struct sess_data *sess_data) capabilities = cifs_ssetup_hdr(ses, pSMB); pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - pSMB->req_no_secext.CaseInsensitivePasswordLength = - cpu_to_le16(CIFS_AUTH_RESP_SIZE); - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(CIFS_AUTH_RESP_SIZE); - - /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses, sess_data->nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLM authentication\n", - rc); - goto out; - } + if (ses->user_name != NULL) { + pSMB->req_no_secext.CaseInsensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLM authentication\n", + rc); + goto out; + } - /* copy ntlm response */ - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + } else { + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + pSMB->req_no_secext.CaseSensitivePasswordLength = 0; + } if (ses->capabilities & CAP_UNICODE) { /* unicode strings must be word aligned */ @@ -878,22 +895,26 @@ sess_auth_ntlmv2(struct sess_data *sess_data) /* LM2 password would be here if we supported it */ pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - /* calculate nlmv2 response and session key */ - rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); - goto out; - } + if (ses->user_name != NULL) { + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); + goto out; + } - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - ses->auth_key.len - CIFS_SESS_KEY_SIZE); - bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - /* set case sensitive password length after tilen may get - * assigned, tilen is 0 otherwise. - */ - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. + */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + } else { + pSMB->req_no_secext.CaseSensitivePasswordLength = 0; + } if (ses->capabilities & CAP_UNICODE) { if (sess_data->iov[0].iov_len % 2) { diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index bc0bb9c34f72..0ffa18094335 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -44,6 +44,7 @@ #define SMB2_OP_DELETE 7 #define SMB2_OP_HARDLINK 8 #define SMB2_OP_SET_EOF 9 +#define SMB2_OP_RMDIR 10 /* Used when constructing chained read requests. */ #define CHAINED_REQUEST 1 diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 899bbc86f73e..4f0231e685a9 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -80,6 +80,10 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, * SMB2_open() call. */ break; + case SMB2_OP_RMDIR: + tmprc = SMB2_rmdir(xid, tcon, fid.persistent_fid, + fid.volatile_fid); + break; case SMB2_OP_RENAME: tmprc = SMB2_rename(xid, tcon, fid.persistent_fid, fid.volatile_fid, (__le16 *)data); @@ -191,8 +195,8 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE, - NULL, SMB2_OP_DELETE); + CREATE_NOT_FILE, + NULL, SMB2_OP_RMDIR); } int diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 42e1f440eb1e..8f38e33d365b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2575,6 +2575,22 @@ SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, } int +SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid) +{ + __u8 delete_pending = 1; + void *data; + unsigned int size; + + data = &delete_pending; + size = 1; /* sizeof __u8 */ + + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_DISPOSITION_INFORMATION, 1, &data, + &size); +} + +int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le16 *target_file) { diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 4f07dc93608d..eb2cde2f64ba 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -141,6 +141,8 @@ extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le16 *target_file); +extern int SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid); extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le16 *target_file); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 8732a43b1008..bc9a7b634643 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -135,11 +135,10 @@ smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server) int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - int i, rc; + int rc; unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; - int n_vec = rqst->rq_nvec; struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; @@ -171,53 +170,11 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } - for (i = 0; i < n_vec; i++) { - if (iov[i].iov_len == 0) - continue; - if (iov[i].iov_base == NULL) { - cifs_dbg(VFS, "null iovec entry\n"); - return -EIO; - } - /* - * The first entry includes a length field (which does not get - * signed that occupies the first 4 bytes before the header). - */ - if (i == 0) { - if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ - break; /* nothing to sign or corrupt header */ - rc = - crypto_shash_update( - &server->secmech.sdeschmacsha256->shash, - iov[i].iov_base + 4, iov[i].iov_len - 4); - } else { - rc = - crypto_shash_update( - &server->secmech.sdeschmacsha256->shash, - iov[i].iov_base, iov[i].iov_len); - } - if (rc) { - cifs_dbg(VFS, "%s: Could not update with payload\n", - __func__); - return rc; - } - } - - /* now hash over the rq_pages array */ - for (i = 0; i < rqst->rq_npages; i++) { - struct kvec p_iov; - - cifs_rqst_page_to_kvec(rqst, i, &p_iov); - crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - p_iov.iov_base, p_iov.iov_len); - kunmap(rqst->rq_pages[i]); - } - - rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, - sigptr); - if (rc) - cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); + rc = __cifs_calc_signature(rqst, server, sigptr, + &server->secmech.sdeschmacsha256->shash); - memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); + if (!rc) + memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); return rc; } @@ -395,12 +352,10 @@ generate_smb311signingkey(struct cifs_ses *ses) int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - int i; int rc = 0; unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; - int n_vec = rqst->rq_nvec; struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; @@ -431,54 +386,12 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__); return rc; } + + rc = __cifs_calc_signature(rqst, server, sigptr, + &server->secmech.sdesccmacaes->shash); - for (i = 0; i < n_vec; i++) { - if (iov[i].iov_len == 0) - continue; - if (iov[i].iov_base == NULL) { - cifs_dbg(VFS, "null iovec entry"); - return -EIO; - } - /* - * The first entry includes a length field (which does not get - * signed that occupies the first 4 bytes before the header). - */ - if (i == 0) { - if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ - break; /* nothing to sign or corrupt header */ - rc = - crypto_shash_update( - &server->secmech.sdesccmacaes->shash, - iov[i].iov_base + 4, iov[i].iov_len - 4); - } else { - rc = - crypto_shash_update( - &server->secmech.sdesccmacaes->shash, - iov[i].iov_base, iov[i].iov_len); - } - if (rc) { - cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n", - __func__); - return rc; - } - } - - /* now hash over the rq_pages array */ - for (i = 0; i < rqst->rq_npages; i++) { - struct kvec p_iov; - - cifs_rqst_page_to_kvec(rqst, i, &p_iov); - crypto_shash_update(&server->secmech.sdesccmacaes->shash, - p_iov.iov_base, p_iov.iov_len); - kunmap(rqst->rq_pages[i]); - } - - rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash, - sigptr); - if (rc) - cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__); - - memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); + if (!rc) + memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); return rc; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 87abe8ed074c..206a597b2293 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -124,41 +124,32 @@ cifs_delete_mid(struct mid_q_entry *mid) /* * smb_send_kvec - send an array of kvecs to the server * @server: Server to send the data to - * @iov: Pointer to array of kvecs - * @n_vec: length of kvec array + * @smb_msg: Message to send * @sent: amount of data sent on socket is stored here * * Our basic "send data to server" function. Should be called with srv_mutex * held. The caller is responsible for handling the results. */ static int -smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, - size_t *sent) +smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, + size_t *sent) { int rc = 0; - int i = 0; - struct msghdr smb_msg; - unsigned int remaining; - size_t first_vec = 0; + int retries = 0; struct socket *ssocket = server->ssocket; *sent = 0; - smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; - smb_msg.msg_namelen = sizeof(struct sockaddr); - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; + smb_msg->msg_name = (struct sockaddr *) &server->dstaddr; + smb_msg->msg_namelen = sizeof(struct sockaddr); + smb_msg->msg_control = NULL; + smb_msg->msg_controllen = 0; if (server->noblocksnd) - smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; + smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; else - smb_msg.msg_flags = MSG_NOSIGNAL; - - remaining = 0; - for (i = 0; i < n_vec; i++) - remaining += iov[i].iov_len; + smb_msg->msg_flags = MSG_NOSIGNAL; - i = 0; - while (remaining) { + while (msg_data_left(smb_msg)) { /* * If blocking send, we try 3 times, since each can block * for 5 seconds. For nonblocking we have to try more @@ -177,35 +168,21 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, * after the retries we will kill the socket and * reconnect which may clear the network problem. */ - rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], - n_vec - first_vec, remaining); + rc = sock_sendmsg(ssocket, smb_msg); if (rc == -EAGAIN) { - i++; - if (i >= 14 || (!server->noblocksnd && (i > 2))) { + retries++; + if (retries >= 14 || + (!server->noblocksnd && (retries > 2))) { cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n", ssocket); - rc = -EAGAIN; - break; + return -EAGAIN; } - msleep(1 << i); + msleep(1 << retries); continue; } if (rc < 0) - break; - - /* send was at least partially successful */ - *sent += rc; - - if (rc == remaining) { - remaining = 0; - break; - } - - if (rc > remaining) { - cifs_dbg(VFS, "sent %d requested %d\n", rc, remaining); - break; - } + return rc; if (rc == 0) { /* should never happen, letting socket clear before @@ -215,59 +192,11 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, continue; } - remaining -= rc; - - /* the line below resets i */ - for (i = first_vec; i < n_vec; i++) { - if (iov[i].iov_len) { - if (rc > iov[i].iov_len) { - rc -= iov[i].iov_len; - iov[i].iov_len = 0; - } else { - iov[i].iov_base += rc; - iov[i].iov_len -= rc; - first_vec = i; - break; - } - } - } - - i = 0; /* in case we get ENOSPC on the next send */ - rc = 0; + /* send was at least partially successful */ + *sent += rc; + retries = 0; /* in case we get ENOSPC on the next send */ } - return rc; -} - -/** - * rqst_page_to_kvec - Turn a slot in the smb_rqst page array into a kvec - * @rqst: pointer to smb_rqst - * @idx: index into the array of the page - * @iov: pointer to struct kvec that will hold the result - * - * Helper function to convert a slot in the rqst->rq_pages array into a kvec. - * The page will be kmapped and the address placed into iov_base. The length - * will then be adjusted according to the ptailoff. - */ -void -cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, - struct kvec *iov) -{ - /* - * FIXME: We could avoid this kmap altogether if we used - * kernel_sendpage instead of kernel_sendmsg. That will only - * work if signing is disabled though as sendpage inlines the - * page directly into the fraglist. If userspace modifies the - * page after we calculate the signature, then the server will - * reject it and may break the connection. kernel_sendmsg does - * an extra copy of the data and avoids that issue. - */ - iov->iov_base = kmap(rqst->rq_pages[idx]); - - /* if last page, don't send beyond this offset into page */ - if (idx == (rqst->rq_npages - 1)) - iov->iov_len = rqst->rq_tailsz; - else - iov->iov_len = rqst->rq_pagesz; + return 0; } static unsigned long @@ -299,8 +228,9 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); unsigned long send_length; unsigned int i; - size_t total_len = 0, sent; + size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; + struct msghdr smb_msg; int val = 1; if (ssocket == NULL) @@ -321,7 +251,13 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, (char *)&val, sizeof(val)); - rc = smb_send_kvec(server, iov, n_vec, &sent); + size = 0; + for (i = 0; i < n_vec; i++) + size += iov[i].iov_len; + + iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, iov, n_vec, size); + + rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) goto uncork; @@ -329,11 +265,16 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) /* now walk the page array and send each page in it */ for (i = 0; i < rqst->rq_npages; i++) { - struct kvec p_iov; - - cifs_rqst_page_to_kvec(rqst, i, &p_iov); - rc = smb_send_kvec(server, &p_iov, 1, &sent); - kunmap(rqst->rq_pages[i]); + size_t len = i == rqst->rq_npages - 1 + ? rqst->rq_tailsz + : rqst->rq_pagesz; + struct bio_vec bvec = { + .bv_page = rqst->rq_pages[i], + .bv_len = len + }; + iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, + &bvec, 1, len); + rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) break; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index f5dc2f0df4ad..c8b77aa24a1d 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -32,92 +32,24 @@ #include "cifs_unicode.h" #define MAX_EA_VALUE_SIZE 65535 -#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" /* BB need to add server (Samba e.g) support for security and trusted prefix */ -int cifs_removexattr(struct dentry *direntry, const char *ea_name) -{ - int rc = -EOPNOTSUPP; -#ifdef CONFIG_CIFS_XATTR - unsigned int xid; - struct cifs_sb_info *cifs_sb; - struct tcon_link *tlink; - struct cifs_tcon *pTcon; - struct super_block *sb; - char *full_path = NULL; - - if (direntry == NULL) - return -EIO; - if (d_really_is_negative(direntry)) - return -EIO; - sb = d_inode(direntry)->i_sb; - if (sb == NULL) - return -EIO; - - cifs_sb = CIFS_SB(sb); - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); - - xid = get_xid(); - - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; - goto remove_ea_exit; - } - if (ea_name == NULL) { - cifs_dbg(FYI, "Null xattr names not supported\n"); - } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) - && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) { - cifs_dbg(FYI, - "illegal xattr request %s (only user namespace supported)\n", - ea_name); - /* BB what if no namespace prefix? */ - /* Should we just pass them to server, except for - system and perhaps security prefixes? */ - } else { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) - goto remove_ea_exit; +enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; - ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - if (pTcon->ses->server->ops->set_EA) - rc = pTcon->ses->server->ops->set_EA(xid, pTcon, - full_path, ea_name, NULL, (__u16)0, - cifs_sb->local_nls, cifs_remap(cifs_sb)); - } -remove_ea_exit: - kfree(full_path); - free_xid(xid); - cifs_put_tlink(tlink); -#endif - return rc; -} - -int cifs_setxattr(struct dentry *direntry, const char *ea_name, - const void *ea_value, size_t value_size, int flags) +static int cifs_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { int rc = -EOPNOTSUPP; -#ifdef CONFIG_CIFS_XATTR unsigned int xid; - struct cifs_sb_info *cifs_sb; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - struct super_block *sb; char *full_path; - if (direntry == NULL) - return -EIO; - if (d_really_is_negative(direntry)) - return -EIO; - sb = d_inode(direntry)->i_sb; - if (sb == NULL) - return -EIO; - - cifs_sb = CIFS_SB(sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -125,10 +57,10 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, xid = get_xid(); - full_path = build_path_from_dentry(direntry); + full_path = build_path_from_dentry(dentry); if (full_path == NULL) { rc = -ENOMEM; - goto set_ea_exit; + goto out; } /* return dos attributes as pseudo xattr */ /* return alt name if available as pseudo attr */ @@ -136,123 +68,93 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, /* if proc/fs/cifs/streamstoxattr is set then search server for EAs or streams to returns as xattrs */ - if (value_size > MAX_EA_VALUE_SIZE) { + if (size > MAX_EA_VALUE_SIZE) { cifs_dbg(FYI, "size of EA value too large\n"); rc = -EOPNOTSUPP; - goto set_ea_exit; + goto out; } - if (ea_name == NULL) { - cifs_dbg(FYI, "Null xattr names not supported\n"); - } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) - == 0) { + switch (handler->flags) { + case XATTR_USER: if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) - goto set_ea_exit; - if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) - cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); + goto out; - ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ if (pTcon->ses->server->ops->set_EA) rc = pTcon->ses->server->ops->set_EA(xid, pTcon, - full_path, ea_name, ea_value, (__u16)value_size, + full_path, name, value, (__u16)size, cifs_sb->local_nls, cifs_remap(cifs_sb)); - } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) - == 0) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) - goto set_ea_exit; + break; - ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ - if (pTcon->ses->server->ops->set_EA) - rc = pTcon->ses->server->ops->set_EA(xid, pTcon, - full_path, ea_name, ea_value, (__u16)value_size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); - } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, - strlen(CIFS_XATTR_CIFS_ACL)) == 0) { + case XATTR_CIFS_ACL: { #ifdef CONFIG_CIFS_ACL struct cifs_ntsd *pacl; - pacl = kmalloc(value_size, GFP_KERNEL); + + if (!value) + goto out; + pacl = kmalloc(size, GFP_KERNEL); if (!pacl) { rc = -ENOMEM; } else { - memcpy(pacl, ea_value, value_size); - if (pTcon->ses->server->ops->set_acl) + memcpy(pacl, value, size); + if (value && + pTcon->ses->server->ops->set_acl) rc = pTcon->ses->server->ops->set_acl(pacl, - value_size, d_inode(direntry), + size, d_inode(dentry), full_path, CIFS_ACL_DACL); else rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ - CIFS_I(d_inode(direntry))->time = 0; + CIFS_I(d_inode(dentry))->time = 0; kfree(pacl); } -#else - cifs_dbg(FYI, "Set CIFS ACL not supported yet\n"); #endif /* CONFIG_CIFS_ACL */ - } else { - int temp; - temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS, - strlen(XATTR_NAME_POSIX_ACL_ACCESS)); - if (temp == 0) { + break; + } + + case XATTR_ACL_ACCESS: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - ea_value, (const int)value_size, - ACL_TYPE_ACCESS, cifs_sb->local_nls, - cifs_remap(cifs_sb)); - cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc); -#else - cifs_dbg(FYI, "set POSIX ACL not supported\n"); -#endif - } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT, - strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) { + if (!value) + goto out; + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, + value, (const int)size, + ACL_TYPE_ACCESS, cifs_sb->local_nls, + cifs_remap(cifs_sb)); +#endif /* CONFIG_CIFS_POSIX */ + break; + + case XATTR_ACL_DEFAULT: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - ea_value, (const int)value_size, - ACL_TYPE_DEFAULT, cifs_sb->local_nls, - cifs_remap(cifs_sb)); - cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc); -#else - cifs_dbg(FYI, "set default POSIX ACL not supported\n"); -#endif - } else { - cifs_dbg(FYI, "illegal xattr request %s (only user namespace supported)\n", - ea_name); - /* BB what if no namespace prefix? */ - /* Should we just pass them to server, except for - system and perhaps security prefixes? */ - } + if (!value) + goto out; + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, + value, (const int)size, + ACL_TYPE_DEFAULT, cifs_sb->local_nls, + cifs_remap(cifs_sb)); +#endif /* CONFIG_CIFS_POSIX */ + break; } -set_ea_exit: +out: kfree(full_path); free_xid(xid); cifs_put_tlink(tlink); -#endif return rc; } -ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, - void *ea_value, size_t buf_size) +static int cifs_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) { ssize_t rc = -EOPNOTSUPP; -#ifdef CONFIG_CIFS_XATTR unsigned int xid; - struct cifs_sb_info *cifs_sb; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - struct super_block *sb; char *full_path; - if (direntry == NULL) - return -EIO; - if (d_really_is_negative(direntry)) - return -EIO; - sb = d_inode(direntry)->i_sb; - if (sb == NULL) - return -EIO; - - cifs_sb = CIFS_SB(sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -260,98 +162,72 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, xid = get_xid(); - full_path = build_path_from_dentry(direntry); + full_path = build_path_from_dentry(dentry); if (full_path == NULL) { rc = -ENOMEM; - goto get_ea_exit; + goto out; } /* return dos attributes as pseudo xattr */ /* return alt name if available as pseudo attr */ - if (ea_name == NULL) { - cifs_dbg(FYI, "Null xattr names not supported\n"); - } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) - == 0) { + switch (handler->flags) { + case XATTR_USER: if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) - goto get_ea_exit; + goto out; - if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) { - cifs_dbg(FYI, "attempt to query cifs inode metadata\n"); - /* revalidate/getattr then populate from inode */ - } /* BB add else when above is implemented */ - ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, - full_path, ea_name, ea_value, buf_size, + full_path, name, value, size, cifs_sb->local_nls, cifs_remap(cifs_sb)); - } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) - goto get_ea_exit; + break; - ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ - if (pTcon->ses->server->ops->query_all_EAs) - rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, - full_path, ea_name, ea_value, buf_size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); - } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS, - strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) { + case XATTR_CIFS_ACL: { +#ifdef CONFIG_CIFS_ACL + u32 acllen; + struct cifs_ntsd *pacl; + + if (pTcon->ses->server->ops->get_acl == NULL) + goto out; /* rc already EOPNOTSUPP */ + + pacl = pTcon->ses->server->ops->get_acl(cifs_sb, + inode, full_path, &acllen); + if (IS_ERR(pacl)) { + rc = PTR_ERR(pacl); + cifs_dbg(VFS, "%s: error %zd getting sec desc\n", + __func__, rc); + } else { + if (value) { + if (acllen > size) + acllen = -ERANGE; + else + memcpy(value, pacl, acllen); + } + rc = acllen; + kfree(pacl); + } +#endif /* CONFIG_CIFS_ACL */ + break; + } + + case XATTR_ACL_ACCESS: #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - ea_value, buf_size, ACL_TYPE_ACCESS, + value, size, ACL_TYPE_ACCESS, cifs_sb->local_nls, cifs_remap(cifs_sb)); -#else - cifs_dbg(FYI, "Query POSIX ACL not supported yet\n"); -#endif /* CONFIG_CIFS_POSIX */ - } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT, - strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) { +#endif /* CONFIG_CIFS_POSIX */ + break; + + case XATTR_ACL_DEFAULT: #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - ea_value, buf_size, ACL_TYPE_DEFAULT, + value, size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, cifs_remap(cifs_sb)); -#else - cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n"); -#endif /* CONFIG_CIFS_POSIX */ - } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, - strlen(CIFS_XATTR_CIFS_ACL)) == 0) { -#ifdef CONFIG_CIFS_ACL - u32 acllen; - struct cifs_ntsd *pacl; - - if (pTcon->ses->server->ops->get_acl == NULL) - goto get_ea_exit; /* rc already EOPNOTSUPP */ - - pacl = pTcon->ses->server->ops->get_acl(cifs_sb, - d_inode(direntry), full_path, &acllen); - if (IS_ERR(pacl)) { - rc = PTR_ERR(pacl); - cifs_dbg(VFS, "%s: error %zd getting sec desc\n", - __func__, rc); - } else { - if (ea_value) { - if (acllen > buf_size) - acllen = -ERANGE; - else - memcpy(ea_value, pacl, acllen); - } - rc = acllen; - kfree(pacl); - } -#else - cifs_dbg(FYI, "Query CIFS ACL not supported yet\n"); -#endif /* CONFIG_CIFS_ACL */ - } else if (strncmp(ea_name, - XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { - cifs_dbg(FYI, "Trusted xattr namespace not supported yet\n"); - } else if (strncmp(ea_name, - XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { - cifs_dbg(FYI, "Security xattr namespace not supported yet\n"); - } else - cifs_dbg(FYI, - "illegal xattr request %s (only user namespace supported)\n", - ea_name); +#endif /* CONFIG_CIFS_POSIX */ + break; + } /* We could add an additional check for streams ie if proc/fs/cifs/streamstoxattr is set then @@ -361,34 +237,22 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, if (rc == -EINVAL) rc = -EOPNOTSUPP; -get_ea_exit: +out: kfree(full_path); free_xid(xid); cifs_put_tlink(tlink); -#endif return rc; } ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) { ssize_t rc = -EOPNOTSUPP; -#ifdef CONFIG_CIFS_XATTR unsigned int xid; - struct cifs_sb_info *cifs_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - struct super_block *sb; char *full_path; - if (direntry == NULL) - return -EIO; - if (d_really_is_negative(direntry)) - return -EIO; - sb = d_inode(direntry)->i_sb; - if (sb == NULL) - return -EIO; - - cifs_sb = CIFS_SB(sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) return -EOPNOTSUPP; @@ -419,6 +283,50 @@ list_ea_exit: kfree(full_path); free_xid(xid); cifs_put_tlink(tlink); -#endif return rc; } + +static const struct xattr_handler cifs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = XATTR_USER, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + +/* os2.* attributes are treated like user.* attributes */ +static const struct xattr_handler cifs_os2_xattr_handler = { + .prefix = XATTR_OS2_PREFIX, + .flags = XATTR_USER, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + +static const struct xattr_handler cifs_cifs_acl_xattr_handler = { + .name = CIFS_XATTR_CIFS_ACL, + .flags = XATTR_CIFS_ACL, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + +static const struct xattr_handler cifs_posix_acl_access_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = XATTR_ACL_ACCESS, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + +static const struct xattr_handler cifs_posix_acl_default_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = XATTR_ACL_DEFAULT, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + +const struct xattr_handler *cifs_xattr_handlers[] = { + &cifs_user_xattr_handler, + &cifs_os2_xattr_handler, + &cifs_cifs_acl_xattr_handler, + &cifs_posix_acl_access_xattr_handler, + &cifs_posix_acl_default_xattr_handler, + NULL +}; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 42e731b8c80a..6fb8672c0892 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -424,16 +424,22 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (host_file->f_op->iterate) { + if (host_file->f_op->iterate || host_file->f_op->iterate_shared) { struct inode *host_inode = file_inode(host_file); - - inode_lock(host_inode); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { - ret = host_file->f_op->iterate(host_file, ctx); - file_accessed(host_file); + if (host_file->f_op->iterate_shared) { + inode_lock_shared(host_inode); + ret = host_file->f_op->iterate_shared(host_file, ctx); + file_accessed(host_file); + inode_unlock_shared(host_inode); + } else { + inode_lock(host_inode); + ret = host_file->f_op->iterate(host_file, ctx); + file_accessed(host_file); + inode_unlock(host_inode); + } } - inode_unlock(host_inode); return ret; } /* Venus: we must read Venus dirents from a file */ diff --git a/fs/compat.c b/fs/compat.c index a71936a3f4cb..be6e48b0a46c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -884,7 +884,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct compat_old_linux_dirent __user *, dirent, unsigned int, count) { int error; - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); struct compat_readdir_callback buf = { .ctx.actor = compat_fillonedir, .dirent = dirent @@ -897,7 +897,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, if (buf.result) error = buf.result; - fdput(f); + fdput_pos(f); return error; } @@ -936,6 +936,8 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, } dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user(offset, &dirent->d_off)) goto efault; } @@ -975,7 +977,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, if (!access_ok(VERIFY_WRITE, dirent, count)) return -EFAULT; - f = fdget(fd); + f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -989,7 +991,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, else error = count - buf.count; } - fdput(f); + fdput_pos(f); return error; } @@ -1020,6 +1022,8 @@ static int compat_filldir64(struct dir_context *ctx, const char *name, dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user_unaligned(offset, &dirent->d_off)) goto efault; } @@ -1062,7 +1066,7 @@ COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd, if (!access_ok(VERIFY_WRITE, dirent, count)) return -EFAULT; - f = fdget(fd); + f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -1077,7 +1081,7 @@ COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd, else error = count - buf.count; } - fdput(f); + fdput_pos(f); return error; } #endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ea59c891fc53..56fb26127fef 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -494,7 +494,7 @@ out: * If there is an error, the caller will reset the flags via * configfs_detach_rollback(). */ -static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex) +static int configfs_detach_prep(struct dentry *dentry, struct dentry **wait) { struct configfs_dirent *parent_sd = dentry->d_fsdata; struct configfs_dirent *sd; @@ -515,8 +515,8 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex if (sd->s_type & CONFIGFS_USET_DEFAULT) { /* Abort if racing with mkdir() */ if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { - if (wait_mutex) - *wait_mutex = &d_inode(sd->s_dentry)->i_mutex; + if (wait) + *wait= dget(sd->s_dentry); return -EAGAIN; } @@ -524,7 +524,7 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex * Yup, recursive. If there's a problem, blame * deep nesting of default_groups */ - ret = configfs_detach_prep(sd->s_dentry, wait_mutex); + ret = configfs_detach_prep(sd->s_dentry, wait); if (!ret) continue; } else @@ -1458,7 +1458,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) * the new link is temporarily attached */ do { - struct mutex *wait_mutex; + struct dentry *wait; mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); @@ -1469,7 +1469,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) */ ret = sd->s_dependent_count ? -EBUSY : 0; if (!ret) { - ret = configfs_detach_prep(dentry, &wait_mutex); + ret = configfs_detach_prep(dentry, &wait); if (ret) configfs_detach_rollback(dentry); } @@ -1483,8 +1483,9 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) } /* Wait until the racing operation terminates */ - mutex_lock(wait_mutex); - mutex_unlock(wait_mutex); + inode_lock(d_inode(wait)); + inode_unlock(d_inode(wait)); + dput(wait); } } while (ret == -EAGAIN); @@ -1632,11 +1633,9 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - if (ctx->pos == 2) { - spin_lock(&configfs_dirent_lock); + spin_lock(&configfs_dirent_lock); + if (ctx->pos == 2) list_move(q, &parent_sd->s_children); - spin_unlock(&configfs_dirent_lock); - } for (p = q->next; p != &parent_sd->s_children; p = p->next) { struct configfs_dirent *next; const char *name; @@ -1647,9 +1646,6 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx) if (!next->s_element) continue; - name = configfs_get_name(next); - len = strlen(name); - /* * We'll have a dentry and an inode for * PINNED items and for open attribute @@ -1663,7 +1659,6 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx) * they close it. Beyond that, we don't * care. */ - spin_lock(&configfs_dirent_lock); dentry = next->s_dentry; if (dentry) inode = d_inode(dentry); @@ -1673,15 +1668,18 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx) if (!inode) ino = iunique(sb, 2); + name = configfs_get_name(next); + len = strlen(name); + if (!dir_emit(ctx, name, len, ino, dt_type(next))) return 0; spin_lock(&configfs_dirent_lock); list_move(q, p); - spin_unlock(&configfs_dirent_lock); p = q; ctx->pos++; } + spin_unlock(&configfs_dirent_lock); return 0; } @@ -1689,7 +1687,6 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -1697,7 +1694,6 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -1723,7 +1719,6 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - inode_unlock(d_inode(dentry)); return offset; } @@ -1732,7 +1727,7 @@ const struct file_operations configfs_dir_operations = { .release = configfs_dir_close, .llseek = configfs_dir_lseek, .read = generic_read_dir, - .iterate = configfs_readdir, + .iterate_shared = configfs_readdir, }; /** diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 03d124ae27d7..0387968e6f47 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -156,7 +156,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, if (depth > 0) { if (depth <= ARRAY_SIZE(default_group_class)) { - lockdep_set_class(&inode->i_mutex, + lockdep_set_class(&inode->i_rwsem, &default_group_class[depth - 1]); } else { /* diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index a8f3b589a2df..cfd91320e869 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -71,8 +71,8 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; struct dentry *root; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = CONFIGFS_MAGIC; sb->s_op = &configfs_ops; sb->s_time_gran = 1; diff --git a/fs/coredump.c b/fs/coredump.c index 47c32c3bfa1d..38a7ab87e10a 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -413,7 +413,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state) core_state->dumper.task = tsk; core_state->dumper.next = NULL; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + if (!mm->core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); @@ -803,12 +805,9 @@ int dump_skip(struct coredump_params *cprm, size_t nr) static char zeroes[PAGE_SIZE]; struct file *file = cprm->file; if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (cprm->written + nr > cprm->limit) - return 0; if (dump_interrupted() || file->f_op->llseek(file, nr, SEEK_CUR) < 0) return 0; - cprm->written += nr; return 1; } else { while (nr > PAGE_SIZE) { @@ -823,7 +822,7 @@ EXPORT_SYMBOL(dump_skip); int dump_align(struct coredump_params *cprm, int align) { - unsigned mod = cprm->written & (align - 1); + unsigned mod = cprm->file->f_pos & (align - 1); if (align & (align - 1)) return 0; return mod ? dump_skip(cprm, align - mod) : 1; diff --git a/fs/cramfs/README b/fs/cramfs/README index 445d1c2d7646..9d4e7ea311f4 100644 --- a/fs/cramfs/README +++ b/fs/cramfs/README @@ -86,26 +86,26 @@ Block Size (Block size in cramfs refers to the size of input data that is compressed at a time. It's intended to be somewhere around -PAGE_CACHE_SIZE for cramfs_readpage's convenience.) +PAGE_SIZE for cramfs_readpage's convenience.) The superblock ought to indicate the block size that the fs was written for, since comments in <linux/pagemap.h> indicate that -PAGE_CACHE_SIZE may grow in future (if I interpret the comment +PAGE_SIZE may grow in future (if I interpret the comment correctly). -Currently, mkcramfs #define's PAGE_CACHE_SIZE as 4096 and uses that -for blksize, whereas Linux-2.3.39 uses its PAGE_CACHE_SIZE, which in +Currently, mkcramfs #define's PAGE_SIZE as 4096 and uses that +for blksize, whereas Linux-2.3.39 uses its PAGE_SIZE, which in turn is defined as PAGE_SIZE (which can be as large as 32KB on arm). This discrepancy is a bug, though it's not clear which should be changed. -One option is to change mkcramfs to take its PAGE_CACHE_SIZE from +One option is to change mkcramfs to take its PAGE_SIZE from <asm/page.h>. Personally I don't like this option, but it does require the least amount of change: just change `#define -PAGE_CACHE_SIZE (4096)' to `#include <asm/page.h>'. The disadvantage +PAGE_SIZE (4096)' to `#include <asm/page.h>'. The disadvantage is that the generated cramfs cannot always be shared between different kernels, not even necessarily kernels of the same architecture if -PAGE_CACHE_SIZE is subject to change between kernel versions +PAGE_SIZE is subject to change between kernel versions (currently possible with arm and ia64). The remaining options try to make cramfs more sharable. @@ -126,22 +126,22 @@ size. The options are: 1. Always 4096 bytes. 2. Writer chooses blocksize; kernel adapts but rejects blocksize > - PAGE_CACHE_SIZE. + PAGE_SIZE. 3. Writer chooses blocksize; kernel adapts even to blocksize > - PAGE_CACHE_SIZE. + PAGE_SIZE. It's easy enough to change the kernel to use a smaller value than -PAGE_CACHE_SIZE: just make cramfs_readpage read multiple blocks. +PAGE_SIZE: just make cramfs_readpage read multiple blocks. -The cost of option 1 is that kernels with a larger PAGE_CACHE_SIZE +The cost of option 1 is that kernels with a larger PAGE_SIZE value don't get as good compression as they can. The cost of option 2 relative to option 1 is that the code uses variables instead of #define'd constants. The gain is that people -with kernels having larger PAGE_CACHE_SIZE can make use of that if +with kernels having larger PAGE_SIZE can make use of that if they don't mind their cramfs being inaccessible to kernels with -smaller PAGE_CACHE_SIZE values. +smaller PAGE_SIZE values. Option 3 is easy to implement if we don't mind being CPU-inefficient: e.g. get readpage to decompress to a buffer of size MAX_BLKSIZE (which diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b862bc219cd7..7919967488cb 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -137,7 +137,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, * page cache and dentry tree anyway.. * * This also acts as a way to guarantee contiguous areas of up to - * BLKS_PER_BUF*PAGE_CACHE_SIZE, so that the caller doesn't need to + * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to * worry about end-of-buffer issues even when decompressing a full * page cache. */ @@ -152,7 +152,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, */ #define BLKS_PER_BUF_SHIFT (2) #define BLKS_PER_BUF (1 << BLKS_PER_BUF_SHIFT) -#define BUFFER_SIZE (BLKS_PER_BUF*PAGE_CACHE_SIZE) +#define BUFFER_SIZE (BLKS_PER_BUF*PAGE_SIZE) static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; static unsigned buffer_blocknr[READ_BUFFERS]; @@ -173,8 +173,8 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i if (!len) return NULL; - blocknr = offset >> PAGE_CACHE_SHIFT; - offset &= PAGE_CACHE_SIZE - 1; + blocknr = offset >> PAGE_SHIFT; + offset &= PAGE_SIZE - 1; /* Check if an existing buffer already has the data.. */ for (i = 0; i < READ_BUFFERS; i++) { @@ -184,14 +184,14 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i continue; if (blocknr < buffer_blocknr[i]) continue; - blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_CACHE_SHIFT; + blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT; blk_offset += offset; if (blk_offset + len > BUFFER_SIZE) continue; return read_buffers[i] + blk_offset; } - devsize = mapping->host->i_size >> PAGE_CACHE_SHIFT; + devsize = mapping->host->i_size >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. */ for (i = 0; i < BLKS_PER_BUF; i++) { @@ -213,7 +213,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i wait_on_page_locked(page); if (!PageUptodate(page)) { /* asynchronous error */ - page_cache_release(page); + put_page(page); pages[i] = NULL; } } @@ -229,12 +229,12 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i struct page *page = pages[i]; if (page) { - memcpy(data, kmap(page), PAGE_CACHE_SIZE); + memcpy(data, kmap(page), PAGE_SIZE); kunmap(page); - page_cache_release(page); + put_page(page); } else - memset(data, 0, PAGE_CACHE_SIZE); - data += PAGE_CACHE_SIZE; + memset(data, 0, PAGE_SIZE); + data += PAGE_SIZE; } return read_buffers[buffer] + offset; } @@ -353,7 +353,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = CRAMFS_MAGIC; - buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_bsize = PAGE_SIZE; buf->f_blocks = CRAMFS_SB(sb)->blocks; buf->f_bfree = 0; buf->f_bavail = 0; @@ -496,7 +496,7 @@ static int cramfs_readpage(struct file *file, struct page *page) int bytes_filled; void *pgdata; - maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; bytes_filled = 0; pgdata = kmap(page); @@ -516,14 +516,14 @@ static int cramfs_readpage(struct file *file, struct page *page) if (compr_len == 0) ; /* hole */ - else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { + else if (unlikely(compr_len > (PAGE_SIZE << 1))) { pr_err("bad compressed blocksize %u\n", compr_len); goto err; } else { mutex_lock(&read_mutex); bytes_filled = cramfs_uncompress_block(pgdata, - PAGE_CACHE_SIZE, + PAGE_SIZE, cramfs_read(sb, start_offset, compr_len), compr_len); mutex_unlock(&read_mutex); @@ -532,7 +532,7 @@ static int cramfs_readpage(struct file *file, struct page *page) } } - memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); + memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled); flush_dcache_page(page); kunmap(page); SetPageUptodate(page); @@ -561,7 +561,7 @@ static const struct address_space_operations cramfs_aops = { static const struct file_operations cramfs_directory_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = cramfs_readdir, + .iterate_shared = cramfs_readdir, }; static const struct inode_operations cramfs_dir_inode_operations = { diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 06cd1a22240b..2fc8c43ce531 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/bio.h> #include <linux/dcache.h> +#include <linux/namei.h> #include <linux/fscrypto.h> #include <linux/ecryptfs.h> @@ -81,13 +82,14 @@ EXPORT_SYMBOL(fscrypt_release_ctx); /** * fscrypt_get_ctx() - Gets an encryption context * @inode: The inode for which we are doing the crypto + * @gfp_flags: The gfp flag for memory allocation * * Allocates and initializes an encryption context. * * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode) +struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -113,7 +115,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode) list_del(&ctx->free_list); spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); if (!ctx) { - ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS); + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags); if (!ctx) return ERR_PTR(-ENOMEM); ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; @@ -147,7 +149,8 @@ typedef enum { static int do_page_crypto(struct inode *inode, fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page) + struct page *src_page, struct page *dest_page, + gfp_t gfp_flags) { u8 xts_tweak[FS_XTS_TWEAK_SIZE]; struct skcipher_request *req = NULL; @@ -157,7 +160,7 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - req = skcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", @@ -175,10 +178,10 @@ static int do_page_crypto(struct inode *inode, FS_XTS_TWEAK_SIZE - sizeof(index)); sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); + sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + sg_set_page(&src, src_page, PAGE_SIZE, 0); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); @@ -199,10 +202,9 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx) +static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) { - ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, - GFP_NOWAIT); + ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); ctx->flags |= FS_WRITE_PATH_FL; @@ -213,6 +215,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx) * fscypt_encrypt_page() - Encrypts a page * @inode: The inode for which the encryption should take place * @plaintext_page: The page to encrypt. Must be locked. + * @gfp_flags: The gfp flag for memory allocation * * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx * encryption context. @@ -225,7 +228,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx) * error value or NULL. */ struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page) + struct page *plaintext_page, gfp_t gfp_flags) { struct fscrypt_ctx *ctx; struct page *ciphertext_page = NULL; @@ -233,18 +236,19 @@ struct page *fscrypt_encrypt_page(struct inode *inode, BUG_ON(!PageLocked(plaintext_page)); - ctx = fscrypt_get_ctx(inode); + ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; ctx->w.control_page = plaintext_page; err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); + plaintext_page, ciphertext_page, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -275,7 +279,7 @@ int fscrypt_decrypt_page(struct page *page) BUG_ON(!PageLocked(page)); return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page); + FS_DECRYPT, page->index, page, page, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); @@ -287,13 +291,13 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, struct bio *bio; int ret, err = 0; - BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - ctx = fscrypt_get_ctx(inode); + ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); if (IS_ERR(ciphertext_page)) { err = PTR_ERR(ciphertext_page); goto errout; @@ -301,11 +305,12 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, while (len--) { err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page); + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); if (err) goto errout; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(GFP_NOWAIT, 1); if (!bio) { err = -ENOMEM; goto errout; @@ -345,13 +350,20 @@ EXPORT_SYMBOL(fscrypt_zeroout_range); */ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { - struct inode *dir = d_inode(dentry->d_parent); - struct fscrypt_info *ci = dir->i_crypt_info; + struct dentry *dir; + struct fscrypt_info *ci; int dir_has_key, cached_with_key; - if (!dir->i_sb->s_cop->is_encrypted(dir)) + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + dput(dir); return 0; + } + ci = d_inode(dir)->i_crypt_info; if (ci && ci->ci_keyring_key && (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED) | @@ -363,6 +375,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); dir_has_key = (ci != NULL); + dput(dir); /* * If the dentry was cached without the key, and it is a diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 06f5aa478bf2..1ac263eddc4e 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -78,6 +78,67 @@ out: return res; } +static int validate_user_key(struct fscrypt_info *crypt_info, + struct fscrypt_context *ctx, u8 *raw_key, + u8 *prefix, int prefix_size) +{ + u8 *full_key_descriptor; + struct key *keyring_key; + struct fscrypt_key *master_key; + const struct user_key_payload *ukp; + int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; + int res; + + full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); + if (!full_key_descriptor) + return -ENOMEM; + + memcpy(full_key_descriptor, prefix, prefix_size); + sprintf(full_key_descriptor + prefix_size, + "%*phN", FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + full_key_descriptor[full_key_len - 1] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + kfree(full_key_descriptor); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "%s: key type must be logon\n", __func__); + res = -ENOKEY; + goto out; + } + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen != sizeof(struct fscrypt_key)) { + res = -EINVAL; + up_read(&keyring_key->sem); + goto out; + } + master_key = (struct fscrypt_key *)ukp->data; + BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); + + if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "%s: key size incorrect: %d\n", + __func__, master_key->size); + res = -ENOKEY; + up_read(&keyring_key->sem); + goto out; + } + res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); + if (res) + goto out; + + crypt_info->ci_keyring_key = keyring_key; + return 0; +out: + key_put(keyring_key); + return res; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -91,12 +152,7 @@ static void put_crypt_info(struct fscrypt_info *ci) int get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; - u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE + - (FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct fscrypt_key *master_key; struct fscrypt_context ctx; - const struct user_key_payload *ukp; struct crypto_skcipher *ctfm; const char *cipher_str; u8 raw_key[FS_MAX_KEY_SIZE]; @@ -167,48 +223,24 @@ retry: memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; } - memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX, - FS_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE + - (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - crypt_info->ci_keyring_key = keyring_key; - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "%s: key type must be logon\n", __func__); - res = -ENOKEY; - goto out; - } - down_read(&keyring_key->sem); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct fscrypt_key)) { - res = -EINVAL; - up_read(&keyring_key->sem); - goto out; - } - master_key = (struct fscrypt_key *)ukp->data; - BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { - printk_once(KERN_WARNING - "%s: key size incorrect: %d\n", - __func__, master_key->size); - res = -ENOKEY; - up_read(&keyring_key->sem); + res = validate_user_key(crypt_info, &ctx, raw_key, + FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + if (res && inode->i_sb->s_cop->key_prefix) { + u8 *prefix = NULL; + int prefix_size, res2; + + prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); + res2 = validate_user_key(crypt_info, &ctx, raw_key, + prefix, prefix_size); + if (res2) { + if (res2 == -ENOKEY) + res = -ENOKEY; + goto out; + } + } else if (res) { goto out; } - res = derive_key_aes(ctx.nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { @@ -32,6 +32,44 @@ #include <linux/pfn_t.h> #include <linux/sizes.h> +/* + * We use lowest available bit in exceptional entry for locking, other two + * bits to determine entry type. In total 3 special bits. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) +#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ + RADIX_TREE_EXCEPTIONAL_ENTRY)) + +/* We choose 4096 entries - same as per-zone page wait tables */ +#define DAX_WAIT_TABLE_BITS 12 +#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) + +wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; + +static int __init init_dax_wait_table(void) +{ + int i; + + for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) + init_waitqueue_head(wait_table + i); + return 0; +} +fs_initcall(init_dax_wait_table); + +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index) +{ + unsigned long hash = hash_long((unsigned long)mapping ^ index, + DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} + static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; @@ -78,50 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) return page; } -/* - * dax_clear_sectors() is called from within transaction context from XFS, - * and hence this means the stack from this point must follow GFP_NOFS - * semantics for all operations. - */ -int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) -{ - struct blk_dax_ctl dax = { - .sector = _sector, - .size = _size, - }; - - might_sleep(); - do { - long count, sz; - - count = dax_map_atomic(bdev, &dax); - if (count < 0) - return count; - sz = min_t(long, count, SZ_128K); - clear_pmem(dax.addr, sz); - dax.size -= sz; - dax.sector += sz / 512; - dax_unmap_atomic(bdev, &dax); - cond_resched(); - } while (dax.size); - - wmb_pmem(); - return 0; -} -EXPORT_SYMBOL_GPL(dax_clear_sectors); - -/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ -static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, - loff_t pos, loff_t end) -{ - loff_t final = end - pos + first; /* The final byte of the buffer */ - - if (first > 0) - clear_pmem(addr, first); - if (final < size) - clear_pmem(addr + final, size - final); -} - static bool buffer_written(struct buffer_head *bh) { return buffer_mapped(bh) && !buffer_unwritten(bh); @@ -160,6 +154,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, struct blk_dax_ctl dax = { .addr = (void __pmem *) ERR_PTR(-EIO), }; + unsigned blkbits = inode->i_blkbits; + sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) + >> blkbits; if (rw == READ) end = min(end, i_size_read(inode)); @@ -167,7 +164,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, while (pos < end) { size_t len; if (pos == max) { - unsigned blkbits = inode->i_blkbits; long page = pos >> PAGE_SHIFT; sector_t block = page << (PAGE_SHIFT - blkbits); unsigned first = pos - (block << blkbits); @@ -183,6 +179,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size = 1 << blkbits; bh_max = pos - first + bh->b_size; bdev = bh->b_bdev; + /* + * We allow uninitialized buffers for writes + * beyond EOF as those cannot race with faults + */ + WARN_ON_ONCE( + (buffer_new(bh) && block < file_blks) || + (rw == WRITE && buffer_unwritten(bh))); } else { unsigned done = bh->b_size - (bh_max - (pos - first)); @@ -202,11 +205,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, rc = map_len; break; } - if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(dax.addr, map_len, first, - pos, end); - need_wmb = true; - } dax.addr += first; size = map_len - first; } @@ -244,7 +242,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, * @iocb: The control block for this I/O * @inode: The file which the I/O is directed at * @iter: The addresses to do I/O from or to - * @pos: The file offset where the I/O starts * @get_block: The filesystem method used to translate file offsets to blocks * @end_io: A filesystem callback for I/O completion * @flags: See below @@ -257,25 +254,19 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, * is in progress. */ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, - struct iov_iter *iter, loff_t pos, get_block_t get_block, + struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, int flags) { struct buffer_head bh; ssize_t retval = -EINVAL; + loff_t pos = iocb->ki_pos; loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { - struct address_space *mapping = inode->i_mapping; + if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_lock(inode); - retval = filemap_write_and_wait_range(mapping, pos, end - 1); - if (retval) { - inode_unlock(inode); - goto out; - } - } /* Protects against truncate */ if (!(flags & DIO_SKIP_DIO_COUNT)) @@ -296,12 +287,268 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(inode); - out: return retval; } EXPORT_SYMBOL_GPL(dax_do_io); /* + * DAX radix tree locking + */ +struct exceptional_entry_key { + struct address_space *mapping; + unsigned long index; +}; + +struct wait_exceptional_entry_queue { + wait_queue_t wait; + struct exceptional_entry_key key; +}; + +static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, + int sync, void *keyp) +{ + struct exceptional_entry_key *key = keyp; + struct wait_exceptional_entry_queue *ewait = + container_of(wait, struct wait_exceptional_entry_queue, wait); + + if (key->mapping != ewait->key.mapping || + key->index != ewait->key.index) + return 0; + return autoremove_wake_function(wait, mode, sync, NULL); +} + +/* + * Check whether the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline int slot_locked(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + return entry & RADIX_DAX_ENTRY_LOCK; +} + +/* + * Mark the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline void *lock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry |= RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Mark the given slot is unlocked. The function must be called with + * mapping->tree_lock held + */ +static inline void *unlock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Lookup entry in radix tree, wait for it to become unlocked if it is + * exceptional entry and return it. The caller must call + * put_unlocked_mapping_entry() when he decided not to lock the entry or + * put_locked_mapping_entry() when he locked the entry and now wants to + * unlock it. + * + * The function must be called with mapping->tree_lock held. + */ +static void *get_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void ***slotp) +{ + void *ret, **slot; + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + ewait.key.mapping = mapping; + ewait.key.index = index; + + for (;;) { + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + &slot); + if (!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot)) { + if (slotp) + *slotp = slot; + return ret; + } + prepare_to_wait_exclusive(wq, &ewait.wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mapping->tree_lock); + schedule(); + finish_wait(wq, &ewait.wait); + spin_lock_irq(&mapping->tree_lock); + } +} + +/* + * Find radix tree entry at given index. If it points to a page, return with + * the page locked. If it points to the exceptional entry, return with the + * radix tree entry locked. If the radix tree doesn't contain given index, + * create empty exceptional entry for the index and return with it locked. + * + * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For + * persistent memory the benefit is doubtful. We can add that later if we can + * show it helps. + */ +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + +restart: + spin_lock_irq(&mapping->tree_lock); + ret = get_unlocked_mapping_entry(mapping, index, &slot); + /* No entry for given index? Make sure radix tree is big enough. */ + if (!ret) { + int err; + + spin_unlock_irq(&mapping->tree_lock); + err = radix_tree_preload( + mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); + if (err) + return ERR_PTR(err); + ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK); + spin_lock_irq(&mapping->tree_lock); + err = radix_tree_insert(&mapping->page_tree, index, ret); + radix_tree_preload_end(); + if (err) { + spin_unlock_irq(&mapping->tree_lock); + /* Someone already created the entry? */ + if (err == -EEXIST) + goto restart; + return ERR_PTR(err); + } + /* Good, we have inserted empty locked entry into the tree. */ + mapping->nrexceptional++; + spin_unlock_irq(&mapping->tree_lock); + return ret; + } + /* Normal page in radix tree? */ + if (!radix_tree_exceptional_entry(ret)) { + struct page *page = ret; + + get_page(page); + spin_unlock_irq(&mapping->tree_lock); + lock_page(page); + /* Page got truncated? Retry... */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto restart; + } + return page; + } + ret = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all) +{ + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) { + struct exceptional_entry_key key; + + key.mapping = mapping; + key.index = index; + __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); + } +} + +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + + spin_lock_irq(&mapping->tree_lock); + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + dax_unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. + */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +/* + * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree + * entry to get unlocked before deleting it. + */ +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *entry; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + /* + * This gets called from truncate / punch_hole path. As such, the caller + * must hold locks protecting against concurrent modifications of the + * radix tree (usually fs-private i_mmap_sem for writing). Since the + * caller has seen exceptional entry for this index, we better find it + * at that index as well... + */ + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, true); + + return 1; +} + +/* * The user has performed a load from a hole in the file. Allocating * a new page in the file would cause excessive storage usage for * workloads with sparse files. We allocate a page cache page instead. @@ -309,24 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io); * otherwise it will simply fall out of the page cache under memory * pressure without ever having been dirtied. */ -static int dax_load_hole(struct address_space *mapping, struct page *page, - struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, + struct vm_fault *vmf) { - unsigned long size; - struct inode *inode = mapping->host; - if (!page) - page = find_or_create_page(mapping, vmf->pgoff, - GFP_KERNEL | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - /* Recheck i_size under page lock to avoid truncate race */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) { - unlock_page(page); - page_cache_release(page); - return VM_FAULT_SIGBUS; + struct page *page; + + /* Hole page already exists? Return it... */ + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; } + /* This will replace locked radix tree entry with a hole page */ + page = find_or_create_page(mapping, vmf->pgoff, + vmf->gfp_mask | __GFP_ZERO); + if (!page) { + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + return VM_FAULT_OOM; + } vmf->page = page; return VM_FAULT_LOCKED; } @@ -350,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } -#define NO_SECTOR -1 -#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) +#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) -static int dax_radix_entry(struct address_space *mapping, pgoff_t index, - sector_t sector, bool pmd_entry, bool dirty) +static void *dax_insert_mapping_entry(struct address_space *mapping, + struct vm_fault *vmf, + void *entry, sector_t sector) { struct radix_tree_root *page_tree = &mapping->page_tree; - pgoff_t pmd_index = DAX_PMD_INDEX(index); - int type, error = 0; - void *entry; + int error = 0; + bool hole_fill = false; + void *new_entry; + pgoff_t index = vmf->pgoff; - WARN_ON_ONCE(pmd_entry && !dirty); - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - spin_lock_irq(&mapping->tree_lock); - - entry = radix_tree_lookup(page_tree, pmd_index); - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { - index = pmd_index; - goto dirty; + /* Replacing hole page with block mapping? */ + if (!radix_tree_exceptional_entry(entry)) { + hole_fill = true; + /* + * Unmap the page now before we remove it from page cache below. + * The page is locked so it cannot be faulted in again. + */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); + error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); + if (error) + return ERR_PTR(error); } - entry = radix_tree_lookup(page_tree, index); - if (entry) { - type = RADIX_DAX_TYPE(entry); - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && - type != RADIX_DAX_PMD)) { - error = -EIO; + spin_lock_irq(&mapping->tree_lock); + new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | + RADIX_DAX_ENTRY_LOCK); + if (hole_fill) { + __delete_from_page_cache(entry, NULL); + /* Drop pagecache reference */ + put_page(entry); + error = radix_tree_insert(page_tree, index, new_entry); + if (error) { + new_entry = ERR_PTR(error); goto unlock; } + mapping->nrexceptional++; + } else { + void **slot; + void *ret; - if (!pmd_entry || type == RADIX_DAX_PMD) - goto dirty; - - /* - * We only insert dirty PMD entries into the radix tree. This - * means we don't need to worry about removing a dirty PTE - * entry and inserting a clean PMD entry, thus reducing the - * range we would flush with a follow-up fsync/msync call. - */ - radix_tree_delete(&mapping->page_tree, index); - mapping->nrexceptional--; - } - - if (sector == NO_SECTOR) { - /* - * This can happen during correct operation if our pfn_mkwrite - * fault raced against a hole punch operation. If this - * happens the pte that was hole punched will have been - * unmapped and the radix tree entry will have been removed by - * the time we are called, but the call will still happen. We - * will return all the way up to wp_pfn_shared(), where the - * pte_same() check will fail, eventually causing page fault - * to be retried by the CPU. - */ - goto unlock; + ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + WARN_ON_ONCE(ret != entry); + radix_tree_replace_slot(slot, new_entry); } - - error = radix_tree_insert(page_tree, index, - RADIX_DAX_ENTRY(sector, pmd_entry)); - if (error) - goto unlock; - - mapping->nrexceptional++; - dirty: - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); - return error; + if (hole_fill) { + radix_tree_preload_end(); + /* + * We don't need hole page anymore, it has been replaced with + * locked radix tree entry now. + */ + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(entry); + unlock_page(entry); + put_page(entry); + } + return new_entry; } static int dax_writeback_one(struct block_device *bdev, @@ -506,8 +748,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; - start_index = wbc->range_start >> PAGE_CACHE_SHIFT; - end_index = wbc->range_end >> PAGE_CACHE_SHIFT; + start_index = wbc->range_start >> PAGE_SHIFT; + end_index = wbc->range_end >> PAGE_SHIFT; pmd_index = DAX_PMD_INDEX(start_index); rcu_read_lock(); @@ -546,56 +788,29 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, +static int dax_insert_mapping(struct address_space *mapping, + struct buffer_head *bh, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), + .sector = to_sector(bh, mapping->host), .size = bh->b_size, }; - pgoff_t size; - int error; - - i_mmap_lock_read(mapping); - - /* - * Check truncate didn't happen while we were allocating a block. - * If it did, this block may or may not be still allocated to the - * file. We can't tell the filesystem to free it because we can't - * take i_mutex here. In the worst case, the file still has blocks - * allocated past the end of the file. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - error = -EIO; - goto out; - } - - if (dax_map_atomic(bdev, &dax) < 0) { - error = PTR_ERR(dax.addr); - goto out; - } + void *ret; + void *entry = *entryp; - if (buffer_unwritten(bh) || buffer_new(bh)) { - clear_pmem(dax.addr, PAGE_SIZE); - wmb_pmem(); - } + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); dax_unmap_atomic(bdev, &dax); - error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, - vmf->flags & FAULT_FLAG_WRITE); - if (error) - goto out; + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + if (IS_ERR(ret)) + return PTR_ERR(ret); + *entryp = ret; - error = vm_insert_mixed(vma, vaddr, dax.pfn); - - out: - i_mmap_unlock_read(mapping); - - return error; + return vm_insert_mixed(vma, vaddr, dax.pfn); } /** @@ -603,24 +818,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks - * @complete_unwritten: The filesystem method used to convert unwritten blocks - * to written so the data written to them is exposed. This is required for - * required by write faults for filesystems that will return unwritten - * extent mappings from @get_block, but it is optional for reads as - * dax_insert_mapping() will always zero unwritten blocks. If the fs does - * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct page *page; + void *entry; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; @@ -629,6 +838,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, int error; int major = 0; + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -638,49 +852,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; - repeat: - page = find_get_page(mapping, vmf->pgoff); - if (page) { - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { - page_cache_release(page); - return VM_FAULT_RETRY; - } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - /* - * We have a struct page covering a hole in the file - * from a read fault and we've raced with a truncate - */ - error = -EIO; - goto unlock_page; - } + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) - goto unlock_page; - - if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_page; - } else { - return dax_load_hole(mapping, page, vmf); - } - } + goto unlock_entry; if (vmf->cow_page) { struct page *new_page = vmf->cow_page; @@ -689,53 +871,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; - vmf->page = page; - if (!page) { - i_mmap_lock_read(mapping); - /* Check we didn't race with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - if (vmf->pgoff >= size) { - i_mmap_unlock_read(mapping); - error = -EIO; - goto out; - } + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; } - return VM_FAULT_LOCKED; - } - - /* Check we didn't race with a read fault installing a new page */ - if (!page && major) - page = find_lock_page(mapping, vmf->pgoff); - - if (page) { - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_CACHE_SIZE, 0); - delete_from_page_cache(page); - unlock_page(page); - page_cache_release(page); - page = NULL; + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; } - /* - * If we successfully insert the new mapping over an unwritten extent, - * we need to ensure we convert the unwritten extent. If there is an - * error inserting the mapping, the filesystem needs to leave it as - * unwritten to prevent exposure of the stale underlying data to - * userspace, but we still need to call the completion function so - * the private resources on the mapping buffer can be released. We - * indicate what the callback should do via the uptodate variable, same - * as for normal BH based IO completions. - */ - error = dax_insert_mapping(inode, &bh, vma, vmf); - if (buffer_unwritten(&bh)) { - if (complete_unwritten) - complete_unwritten(&bh, !error); - else - WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + if (!buffer_mapped(&bh)) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_entry; + } else { + return dax_load_hole(mapping, entry, vmf); + } } + /* Filesystem should not return unwritten buffers to us! */ + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); + error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -743,13 +907,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - - unlock_page: - if (page) { - unlock_page(page); - page_cache_release(page); - } - goto out; } EXPORT_SYMBOL(__dax_fault); @@ -763,7 +920,7 @@ EXPORT_SYMBOL(__dax_fault); * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -772,7 +929,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_fault(vma, vmf, get_block, complete_unwritten); + result = __dax_fault(vma, vmf, get_block); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -780,7 +937,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * The 'colour' (ie low bits) within a PMD of a page offset. This comes up * more often than one might expect in the below function. @@ -806,8 +963,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address, #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -819,7 +975,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int error, result = 0; + int result = 0; bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ @@ -866,6 +1022,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); } bdev = bh.b_bdev; @@ -891,26 +1048,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, truncate_pagecache_range(inode, lstart, lend); } - i_mmap_lock_read(mapping); - - /* - * If a truncate happened while we were allocating blocks, we may - * leave blocks allocated to the file that are beyond EOF. We can't - * take i_mutex here, so just leave them hanging; they'll be freed - * when the file is deleted. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size) { - result = VM_FAULT_SIGBUS; - goto out; - } - if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, - "offset + huge page size > file size"); - goto fallback; - } - - if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { + if (!write && !buffer_mapped(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); @@ -945,8 +1083,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, long length = dax_map_atomic(bdev, &dax); if (length < 0) { - result = VM_FAULT_SIGBUS; - goto out; + dax_pmd_dbg(&bh, address, "dax-error fallback"); + goto fallback; } if (length < PMD_SIZE) { dax_pmd_dbg(&bh, address, "dax-length too small"); @@ -964,14 +1102,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; } - - if (buffer_unwritten(&bh) || buffer_new(&bh)) { - clear_pmem(dax.addr, PMD_SIZE); - wmb_pmem(); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - result |= VM_FAULT_MAJOR; - } dax_unmap_atomic(bdev, &dax); /* @@ -990,13 +1120,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * the write to insert a dirty entry. */ if (write) { - error = dax_radix_entry(mapping, pgoff, dax.sector, - true, true); - if (error) { - dax_pmd_dbg(&bh, address, - "PMD radix insertion failed"); - goto fallback; - } + /* + * We should insert radix-tree entry and dirty it here. + * For now this is broken... + */ } dev_dbg(part_to_dev(bdev->bd_part), @@ -1009,11 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } out: - i_mmap_unlock_read(mapping); - - if (buffer_unwritten(&bh)) - complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); - return result; fallback: @@ -1033,8 +1155,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault); * pmd_fault handler for DAX files. */ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -1043,8 +1164,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_pmd_fault(vma, address, pmd, flags, get_block, - complete_unwritten); + result = __dax_pmd_fault(vma, address, pmd, flags, get_block); if (flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -1061,27 +1181,59 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; - int error; - - /* - * We pass NO_SECTOR to dax_radix_entry() because we expect that a - * RADIX_DAX_PTE entry already exists in the radix tree from a - * previous call to __dax_fault(). We just want to look up that PTE - * entry using vmf->pgoff and make sure the dirty tag is set. This - * saves us from having to make a call to get_block() here to look - * up the sector. - */ - error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, - true); + struct address_space *mapping = file->f_mapping; + void *entry; + pgoff_t index = vmf->pgoff; - if (error == -ENOMEM) - return VM_FAULT_OOM; - if (error) - return VM_FAULT_SIGBUS; + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (!entry || !radix_tree_exceptional_entry(entry)) + goto out; + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + put_unlocked_mapping_entry(mapping, index, entry); +out: + spin_unlock_irq(&mapping->tree_lock); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); +static bool dax_range_is_aligned(struct block_device *bdev, + unsigned int offset, unsigned int length) +{ + unsigned short sector_size = bdev_logical_block_size(bdev); + + if (!IS_ALIGNED(offset, sector_size)) + return false; + if (!IS_ALIGNED(length, sector_size)) + return false; + + return true; +} + +int __dax_zero_page_range(struct block_device *bdev, sector_t sector, + unsigned int offset, unsigned int length) +{ + struct blk_dax_ctl dax = { + .sector = sector, + .size = PAGE_SIZE, + }; + + if (dax_range_is_aligned(bdev, offset, length)) { + sector_t start_sector = dax.sector + (offset >> 9); + + return blkdev_issue_zeroout(bdev, start_sector, + length >> 9, GFP_NOFS, true); + } else { + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); + wmb_pmem(); + dax_unmap_atomic(bdev, &dax); + } + return 0; +} +EXPORT_SYMBOL_GPL(__dax_zero_page_range); + /** * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated @@ -1093,47 +1245,29 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); * page in a DAX file. This is intended for hole-punch operations. If * you are truncating a file, the helper function dax_truncate_page() may be * more convenient. - * - * We work in terms of PAGE_CACHE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. */ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, get_block_t get_block) { struct buffer_head bh; - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + pgoff_t index = from >> PAGE_SHIFT; + unsigned offset = from & (PAGE_SIZE-1); int err; /* Block boundary? Nothing to do */ if (!length) return 0; - BUG_ON((offset + length) > PAGE_CACHE_SIZE); + BUG_ON((offset + length) > PAGE_SIZE); memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; - bh.b_size = PAGE_CACHE_SIZE; + bh.b_size = PAGE_SIZE; err = get_block(inode, index, &bh, 0); - if (err < 0) + if (err < 0 || !buffer_written(&bh)) return err; - if (buffer_written(&bh)) { - struct block_device *bdev = bh.b_bdev; - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PAGE_CACHE_SIZE, - }; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - wmb_pmem(); - dax_unmap_atomic(bdev, &dax); - } - - return 0; + return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), + offset, length); } EXPORT_SYMBOL_GPL(dax_zero_page_range); @@ -1145,16 +1279,10 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range); * * Similar to block_truncate_page(), this function can be called by a * filesystem when it is truncating a DAX file to handle the partial page. - * - * We work in terms of PAGE_CACHE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. */ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) { - unsigned length = PAGE_CACHE_ALIGN(from) - from; + unsigned length = PAGE_ALIGN(from) - from; return dax_zero_page_range(inode, from, length, get_block); } EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/fs/dcache.c b/fs/dcache.c index 32ceae3e6112..c622872c12c5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -111,6 +111,17 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent, return dentry_hashtable + hash_32(hash, d_hash_shift); } +#define IN_LOOKUP_SHIFT 10 +static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; + +static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, + unsigned int hash) +{ + hash += (unsigned long) parent / L1_CACHE_BYTES; + return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); +} + + /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { .age_limit = 45, @@ -761,6 +772,8 @@ repeat: /* Slow case: now with the dentry lock held */ rcu_read_unlock(); + WARN_ON(d_in_lookup(dentry)); + /* Unreachable? Get rid of it */ if (unlikely(d_unhashed(dentry))) goto kill_it; @@ -1558,7 +1571,11 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) * be overwriting an internal NUL character */ dentry->d_iname[DNAME_INLINE_LEN-1] = 0; - if (name->len > DNAME_INLINE_LEN-1) { + if (unlikely(!name)) { + static const struct qstr anon = QSTR_INIT("/", 1); + name = &anon; + dname = dentry->d_iname; + } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); @@ -1667,7 +1684,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | - DCACHE_OP_SELECT_INODE)); + DCACHE_OP_SELECT_INODE | + DCACHE_OP_REAL)); dentry->d_op = op; if (!op) return; @@ -1685,6 +1703,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_select_inode) dentry->d_flags |= DCACHE_OP_SELECT_INODE; + if (op->d_real) + dentry->d_flags |= DCACHE_OP_REAL; } EXPORT_SYMBOL(d_set_d_op); @@ -1743,6 +1763,7 @@ type_determined: static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); + WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); @@ -1772,11 +1793,11 @@ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { + security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); } - security_d_instantiate(entry, inode); } EXPORT_SYMBOL(d_instantiate); @@ -1793,6 +1814,7 @@ int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) { spin_unlock(&inode->i_lock); @@ -1801,7 +1823,6 @@ int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode) } __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); - security_d_instantiate(entry, inode); return 0; } @@ -1812,9 +1833,7 @@ struct dentry *d_make_root(struct inode *root_inode) struct dentry *res = NULL; if (root_inode) { - static const struct qstr name = QSTR_INIT("/", 1); - - res = __d_alloc(root_inode->i_sb, &name); + res = __d_alloc(root_inode->i_sb, NULL); if (res) d_instantiate(res, root_inode); else @@ -1855,7 +1874,6 @@ EXPORT_SYMBOL(d_find_any_alias); static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) { - static const struct qstr anonstring = QSTR_INIT("/", 1); struct dentry *tmp; struct dentry *res; unsigned add_flags; @@ -1869,12 +1887,13 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) if (res) goto out_iput; - tmp = __d_alloc(inode->i_sb, &anonstring); + tmp = __d_alloc(inode->i_sb, NULL); if (!tmp) { res = ERR_PTR(-ENOMEM); goto out_iput; } + security_d_instantiate(tmp, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); if (res) { @@ -1897,13 +1916,10 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) hlist_bl_unlock(&tmp->d_sb->s_anon); spin_unlock(&tmp->d_lock); spin_unlock(&inode->i_lock); - security_d_instantiate(tmp, inode); return tmp; out_iput: - if (res && !IS_ERR(res)) - security_d_instantiate(res, inode); iput(inode); return res; } @@ -1972,28 +1988,36 @@ EXPORT_SYMBOL(d_obtain_root); struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { - struct dentry *found; - struct dentry *new; + struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); - if (!found) { - new = d_alloc(dentry->d_parent, name); - if (!new) { - found = ERR_PTR(-ENOMEM); - } else { - found = d_splice_alias(inode, new); - if (found) { - dput(new); - return found; - } - return new; + if (found) { + iput(inode); + return found; + } + if (d_in_lookup(dentry)) { + found = d_alloc_parallel(dentry->d_parent, name, + dentry->d_wait); + if (IS_ERR(found) || !d_in_lookup(found)) { + iput(inode); + return found; } + } else { + found = d_alloc(dentry->d_parent, name); + if (!found) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + } + res = d_splice_alias(inode, found); + if (res) { + dput(found); + return res; } - iput(inode); return found; } EXPORT_SYMBOL(d_add_ci); @@ -2360,17 +2384,194 @@ void d_rehash(struct dentry * entry) } EXPORT_SYMBOL(d_rehash); +static inline unsigned start_dir_add(struct inode *dir) +{ + + for (;;) { + unsigned n = dir->i_dir_seq; + if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + return n; + cpu_relax(); + } +} + +static inline void end_dir_add(struct inode *dir, unsigned n) +{ + smp_store_release(&dir->i_dir_seq, n + 2); +} + +static void d_wait_lookup(struct dentry *dentry) +{ + if (d_in_lookup(dentry)) { + DECLARE_WAITQUEUE(wait, current); + add_wait_queue(dentry->d_wait, &wait); + do { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&dentry->d_lock); + schedule(); + spin_lock(&dentry->d_lock); + } while (d_in_lookup(dentry)); + } +} + +struct dentry *d_alloc_parallel(struct dentry *parent, + const struct qstr *name, + wait_queue_head_t *wq) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_bl_head *b = in_lookup_hash(parent, hash); + struct hlist_bl_node *node; + struct dentry *new = d_alloc(parent, name); + struct dentry *dentry; + unsigned seq, r_seq, d_seq; + + if (unlikely(!new)) + return ERR_PTR(-ENOMEM); + +retry: + rcu_read_lock(); + seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1; + r_seq = read_seqbegin(&rename_lock); + dentry = __d_lookup_rcu(parent, name, &d_seq); + if (unlikely(dentry)) { + if (!lockref_get_not_dead(&dentry->d_lockref)) { + rcu_read_unlock(); + goto retry; + } + if (read_seqcount_retry(&dentry->d_seq, d_seq)) { + rcu_read_unlock(); + dput(dentry); + goto retry; + } + rcu_read_unlock(); + dput(new); + return dentry; + } + if (unlikely(read_seqretry(&rename_lock, r_seq))) { + rcu_read_unlock(); + goto retry; + } + hlist_bl_lock(b); + if (unlikely(parent->d_inode->i_dir_seq != seq)) { + hlist_bl_unlock(b); + rcu_read_unlock(); + goto retry; + } + rcu_read_unlock(); + /* + * No changes for the parent since the beginning of d_lookup(). + * Since all removals from the chain happen with hlist_bl_lock(), + * any potential in-lookup matches are going to stay here until + * we unlock the chain. All fields are stable in everything + * we encounter. + */ + hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { + if (dentry->d_name.hash != hash) + continue; + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + if (parent->d_flags & DCACHE_OP_COMPARE) { + int tlen = dentry->d_name.len; + const char *tname = dentry->d_name.name; + if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) + continue; + } else { + if (dentry->d_name.len != len) + continue; + if (dentry_cmp(dentry, str, len)) + continue; + } + dget(dentry); + hlist_bl_unlock(b); + /* somebody is doing lookup for it right now; wait for it */ + spin_lock(&dentry->d_lock); + d_wait_lookup(dentry); + /* + * it's not in-lookup anymore; in principle we should repeat + * everything from dcache lookup, but it's likely to be what + * d_lookup() would've found anyway. If it is, just return it; + * otherwise we really have to repeat the whole thing. + */ + if (unlikely(dentry->d_name.hash != hash)) + goto mismatch; + if (unlikely(dentry->d_parent != parent)) + goto mismatch; + if (unlikely(d_unhashed(dentry))) + goto mismatch; + if (parent->d_flags & DCACHE_OP_COMPARE) { + int tlen = dentry->d_name.len; + const char *tname = dentry->d_name.name; + if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) + goto mismatch; + } else { + if (unlikely(dentry->d_name.len != len)) + goto mismatch; + if (unlikely(dentry_cmp(dentry, str, len))) + goto mismatch; + } + /* OK, it *is* a hashed match; return it */ + spin_unlock(&dentry->d_lock); + dput(new); + return dentry; + } + /* we can't take ->d_lock here; it's OK, though. */ + new->d_flags |= DCACHE_PAR_LOOKUP; + new->d_wait = wq; + hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b); + hlist_bl_unlock(b); + return new; +mismatch: + spin_unlock(&dentry->d_lock); + dput(dentry); + goto retry; +} +EXPORT_SYMBOL(d_alloc_parallel); + +void __d_lookup_done(struct dentry *dentry) +{ + struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent, + dentry->d_name.hash); + hlist_bl_lock(b); + dentry->d_flags &= ~DCACHE_PAR_LOOKUP; + __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); + wake_up_all(dentry->d_wait); + dentry->d_wait = NULL; + hlist_bl_unlock(b); + INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_LIST_HEAD(&dentry->d_lru); +} +EXPORT_SYMBOL(__d_lookup_done); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode) { + struct inode *dir = NULL; + unsigned n; + spin_lock(&dentry->d_lock); + if (unlikely(d_in_lookup(dentry))) { + dir = dentry->d_parent->d_inode; + n = start_dir_add(dir); + __d_lookup_done(dentry); + } if (inode) { - __d_instantiate(dentry, inode); + unsigned add_flags = d_flags_for_inode(inode); + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + raw_write_seqcount_begin(&dentry->d_seq); + __d_set_inode_and_type(dentry, inode, add_flags); + raw_write_seqcount_end(&dentry->d_seq); + __fsnotify_d_instantiate(dentry); + } + _d_rehash(dentry); + if (dir) + end_dir_add(dir, n); + spin_unlock(&dentry->d_lock); + if (inode) spin_unlock(&inode->i_lock); - } - security_d_instantiate(dentry, inode); - d_rehash(dentry); } /** @@ -2384,8 +2585,10 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode) void d_add(struct dentry *entry, struct inode *inode) { - if (inode) + if (inode) { + security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); + } __d_add(entry, inode); } EXPORT_SYMBOL(d_add); @@ -2595,6 +2798,8 @@ static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target) static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { + struct inode *dir = NULL; + unsigned n; if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2602,6 +2807,11 @@ static void __d_move(struct dentry *dentry, struct dentry *target, BUG_ON(d_ancestor(target, dentry)); dentry_lock_for_move(dentry, target); + if (unlikely(d_in_lookup(target))) { + dir = target->d_parent->d_inode; + n = start_dir_add(dir); + __d_lookup_done(target); + } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); @@ -2651,6 +2861,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target, write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); + if (dir) + end_dir_add(dir, n); dentry_unlock_for_move(dentry, target); } @@ -2721,7 +2933,8 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) static int __d_unalias(struct inode *inode, struct dentry *dentry, struct dentry *alias) { - struct mutex *m1 = NULL, *m2 = NULL; + struct mutex *m1 = NULL; + struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ @@ -2732,15 +2945,15 @@ static int __d_unalias(struct inode *inode, if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!inode_trylock(alias->d_parent->d_inode)) + if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; - m2 = &alias->d_parent->d_inode->i_mutex; + m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: if (m2) - mutex_unlock(m2); + up_read(m2); if (m1) mutex_unlock(m1); return ret; @@ -2779,6 +2992,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (!inode) goto out; + security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); @@ -2806,7 +3020,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); - security_d_instantiate(new, inode); } iput(inode); return new; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index d2ba12e23ed9..9c1c9a01b7e5 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -22,6 +22,12 @@ #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> +#include <linux/srcu.h> +#include <asm/poll.h> + +#include "internal.h" + +struct poll_table_struct; static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -35,27 +41,293 @@ static ssize_t default_write_file(struct file *file, const char __user *buf, return count; } -const struct file_operations debugfs_file_operations = { +const struct file_operations debugfs_noop_file_operations = { .read = default_read_file, .write = default_write_file, .open = simple_open, .llseek = noop_llseek, }; -static struct dentry *debugfs_create_mode(const char *name, umode_t mode, - struct dentry *parent, void *value, - const struct file_operations *fops, - const struct file_operations *fops_ro, - const struct file_operations *fops_wo) +/** + * debugfs_use_file_start - mark the beginning of file data access + * @dentry: the dentry object whose data is being accessed. + * @srcu_idx: a pointer to some memory to store a SRCU index in. + * + * Up to a matching call to debugfs_use_file_finish(), any + * successive call into the file removing functions debugfs_remove() + * and debugfs_remove_recursive() will block. Since associated private + * file data may only get freed after a successful return of any of + * the removal functions, you may safely access it after a successful + * call to debugfs_use_file_start() without worrying about + * lifetime issues. + * + * If -%EIO is returned, the file has already been removed and thus, + * it is not safe to access any of its data. If, on the other hand, + * it is allowed to access the file data, zero is returned. + * + * Regardless of the return code, any call to + * debugfs_use_file_start() must be followed by a matching call + * to debugfs_use_file_finish(). + */ +int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) + __acquires(&debugfs_srcu) +{ + *srcu_idx = srcu_read_lock(&debugfs_srcu); + barrier(); + if (d_unlinked(dentry)) + return -EIO; + return 0; +} +EXPORT_SYMBOL_GPL(debugfs_use_file_start); + +/** + * debugfs_use_file_finish - mark the end of file data access + * @srcu_idx: the SRCU index "created" by a former call to + * debugfs_use_file_start(). + * + * Allow any ongoing concurrent call into debugfs_remove() or + * debugfs_remove_recursive() blocked by a former call to + * debugfs_use_file_start() to proceed and return to its caller. + */ +void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu) +{ + srcu_read_unlock(&debugfs_srcu, srcu_idx); +} +EXPORT_SYMBOL_GPL(debugfs_use_file_finish); + +#define F_DENTRY(filp) ((filp)->f_path.dentry) + +#define REAL_FOPS_DEREF(dentry) \ + ((const struct file_operations *)(dentry)->d_fsdata) + +static int open_proxy_open(struct inode *inode, struct file *filp) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = NULL; + int srcu_idx, r; + + r = debugfs_use_file_start(dentry, &srcu_idx); + if (r) { + r = -ENOENT; + goto out; + } + + real_fops = REAL_FOPS_DEREF(dentry); + real_fops = fops_get(real_fops); + if (!real_fops) { + /* Huh? Module did not clean up after itself at exit? */ + WARN(1, "debugfs file owner did not clean up at exit: %pd", + dentry); + r = -ENXIO; + goto out; + } + replace_fops(filp, real_fops); + + if (real_fops->open) + r = real_fops->open(inode, filp); + +out: + fops_put(real_fops); + debugfs_use_file_finish(srcu_idx); + return r; +} + +const struct file_operations debugfs_open_proxy_file_operations = { + .open = open_proxy_open, +}; + +#define PROTO(args...) args +#define ARGS(args...) args + +#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ +static ret_type full_proxy_ ## name(proto) \ +{ \ + const struct dentry *dentry = F_DENTRY(filp); \ + const struct file_operations *real_fops = \ + REAL_FOPS_DEREF(dentry); \ + int srcu_idx; \ + ret_type r; \ + \ + r = debugfs_use_file_start(dentry, &srcu_idx); \ + if (likely(!r)) \ + r = real_fops->name(args); \ + debugfs_use_file_finish(srcu_idx); \ + return r; \ +} + +FULL_PROXY_FUNC(llseek, loff_t, filp, + PROTO(struct file *filp, loff_t offset, int whence), + ARGS(filp, offset, whence)); + +FULL_PROXY_FUNC(read, ssize_t, filp, + PROTO(struct file *filp, char __user *buf, size_t size, + loff_t *ppos), + ARGS(filp, buf, size, ppos)); + +FULL_PROXY_FUNC(write, ssize_t, filp, + PROTO(struct file *filp, const char __user *buf, size_t size, + loff_t *ppos), + ARGS(filp, buf, size, ppos)); + +FULL_PROXY_FUNC(unlocked_ioctl, long, filp, + PROTO(struct file *filp, unsigned int cmd, unsigned long arg), + ARGS(filp, cmd, arg)); + +static unsigned int full_proxy_poll(struct file *filp, + struct poll_table_struct *wait) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + int srcu_idx; + unsigned int r = 0; + + if (debugfs_use_file_start(dentry, &srcu_idx)) { + debugfs_use_file_finish(srcu_idx); + return POLLHUP; + } + + r = real_fops->poll(filp, wait); + debugfs_use_file_finish(srcu_idx); + return r; +} + +static int full_proxy_release(struct inode *inode, struct file *filp) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *proxy_fops = filp->f_op; + int r = 0; + + /* + * We must not protect this against removal races here: the + * original releaser should be called unconditionally in order + * not to leak any resources. Releasers must not assume that + * ->i_private is still being meaningful here. + */ + if (real_fops->release) + r = real_fops->release(inode, filp); + + replace_fops(filp, d_inode(dentry)->i_fop); + kfree((void *)proxy_fops); + fops_put(real_fops); + return 0; +} + +static void __full_proxy_fops_init(struct file_operations *proxy_fops, + const struct file_operations *real_fops) +{ + proxy_fops->release = full_proxy_release; + if (real_fops->llseek) + proxy_fops->llseek = full_proxy_llseek; + if (real_fops->read) + proxy_fops->read = full_proxy_read; + if (real_fops->write) + proxy_fops->write = full_proxy_write; + if (real_fops->poll) + proxy_fops->poll = full_proxy_poll; + if (real_fops->unlocked_ioctl) + proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; +} + +static int full_proxy_open(struct inode *inode, struct file *filp) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = NULL; + struct file_operations *proxy_fops = NULL; + int srcu_idx, r; + + r = debugfs_use_file_start(dentry, &srcu_idx); + if (r) { + r = -ENOENT; + goto out; + } + + real_fops = REAL_FOPS_DEREF(dentry); + real_fops = fops_get(real_fops); + if (!real_fops) { + /* Huh? Module did not cleanup after itself at exit? */ + WARN(1, "debugfs file owner did not clean up at exit: %pd", + dentry); + r = -ENXIO; + goto out; + } + + proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL); + if (!proxy_fops) { + r = -ENOMEM; + goto free_proxy; + } + __full_proxy_fops_init(proxy_fops, real_fops); + replace_fops(filp, proxy_fops); + + if (real_fops->open) { + r = real_fops->open(inode, filp); + + if (filp->f_op != proxy_fops) { + /* No protection against file removal anymore. */ + WARN(1, "debugfs file owner replaced proxy fops: %pd", + dentry); + goto free_proxy; + } + } + + goto out; +free_proxy: + kfree(proxy_fops); + fops_put(real_fops); +out: + debugfs_use_file_finish(srcu_idx); + return r; +} + +const struct file_operations debugfs_full_proxy_file_operations = { + .open = full_proxy_open, +}; + +ssize_t debugfs_attr_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + int srcu_idx; + + ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!ret)) + ret = simple_attr_read(file, buf, len, ppos); + debugfs_use_file_finish(srcu_idx); + return ret; +} +EXPORT_SYMBOL_GPL(debugfs_attr_read); + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + int srcu_idx; + + ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!ret)) + ret = simple_attr_write(file, buf, len, ppos); + debugfs_use_file_finish(srcu_idx); + return ret; +} +EXPORT_SYMBOL_GPL(debugfs_attr_write); + +static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, + struct dentry *parent, void *value, + const struct file_operations *fops, + const struct file_operations *fops_ro, + const struct file_operations *fops_wo) { /* if there are no write bits set, make read only */ if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, fops_ro); + return debugfs_create_file_unsafe(name, mode, parent, value, + fops_ro); /* if there are no read bits set, make write only */ if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, fops_wo); + return debugfs_create_file_unsafe(name, mode, parent, value, + fops_wo); - return debugfs_create_file(name, mode, parent, value, fops); + return debugfs_create_file_unsafe(name, mode, parent, value, fops); } static int debugfs_u8_set(void *data, u64 val) @@ -68,9 +340,9 @@ static int debugfs_u8_get(void *data, u64 *val) *val = *(u8 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value @@ -99,7 +371,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); struct dentry *debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_u8, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); @@ -114,9 +386,9 @@ static int debugfs_u16_get(void *data, u64 *val) *val = *(u16 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); /** * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value @@ -145,7 +417,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); struct dentry *debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_u16, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); @@ -160,9 +432,9 @@ static int debugfs_u32_get(void *data, u64 *val) *val = *(u32 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); /** * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value @@ -191,7 +463,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); struct dentry *debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_u32, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); @@ -207,9 +479,9 @@ static int debugfs_u64_get(void *data, u64 *val) *val = *(u64 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); /** * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value @@ -238,7 +510,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); struct dentry *debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_u64, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); @@ -254,9 +526,10 @@ static int debugfs_ulong_get(void *data, u64 *val) *val = *(unsigned long *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, + "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); /** * debugfs_create_ulong - create a debugfs file that is used to read and write @@ -286,26 +559,30 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); struct dentry *debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, unsigned long *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_ulong, - &fops_ulong_ro, &fops_ulong_wo); + return debugfs_create_mode_unsafe(name, mode, parent, value, + &fops_ulong, &fops_ulong_ro, + &fops_ulong_wo); } EXPORT_SYMBOL_GPL(debugfs_create_ulong); -DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, + "0x%04llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, + "0x%08llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, + "0x%016llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value @@ -328,7 +605,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); struct dentry *debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_x8, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); @@ -346,7 +623,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); struct dentry *debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_x16, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); @@ -364,7 +641,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); struct dentry *debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_x32, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); @@ -382,7 +659,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32); struct dentry *debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_x64, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); @@ -398,10 +675,10 @@ static int debugfs_size_t_get(void *data, u64 *val) *val = *(size_t *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, - "%llu\n"); /* %llu and %zu are more or less the same */ -DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, + "%llu\n"); /* %llu and %zu are more or less the same */ +DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value @@ -416,8 +693,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); struct dentry *debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_size_t, - &fops_size_t_ro, &fops_size_t_wo); + return debugfs_create_mode_unsafe(name, mode, parent, value, + &fops_size_t, &fops_size_t_ro, + &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); @@ -431,10 +709,12 @@ static int debugfs_atomic_t_get(void *data, u64 *val) *val = atomic_read((atomic_t *)data); return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, +DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, + "%lld\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, + "%lld\n"); /** * debugfs_create_atomic_t - create a debugfs file that is used to read and @@ -450,8 +730,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t, - &fops_atomic_t_ro, &fops_atomic_t_wo); + return debugfs_create_mode_unsafe(name, mode, parent, value, + &fops_atomic_t, &fops_atomic_t_ro, + &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); @@ -459,9 +740,17 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[3]; - bool *val = file->private_data; + bool val; + int r, srcu_idx; - if (*val) + r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!r)) + val = *(bool *)file->private_data; + debugfs_use_file_finish(srcu_idx); + if (r) + return r; + + if (val) buf[0] = 'Y'; else buf[0] = 'N'; @@ -477,6 +766,7 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, char buf[32]; size_t buf_size; bool bv; + int r, srcu_idx; bool *val = file->private_data; buf_size = min(count, (sizeof(buf)-1)); @@ -484,8 +774,14 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, return -EFAULT; buf[buf_size] = '\0'; - if (strtobool(buf, &bv) == 0) - *val = bv; + if (strtobool(buf, &bv) == 0) { + r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!r)) + *val = bv; + debugfs_use_file_finish(srcu_idx); + if (r) + return r; + } return count; } @@ -537,7 +833,7 @@ static const struct file_operations fops_bool_wo = { struct dentry *debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_bool, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); @@ -546,8 +842,15 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; - return simple_read_from_buffer(user_buf, count, ppos, blob->data, - blob->size); + ssize_t r; + int srcu_idx; + + r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!r)) + r = simple_read_from_buffer(user_buf, count, ppos, blob->data, + blob->size); + debugfs_use_file_finish(srcu_idx); + return r; } static const struct file_operations fops_blob = { @@ -584,7 +887,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { - return debugfs_create_file(name, mode, parent, blob, &fops_blob); + return debugfs_create_file_unsafe(name, mode, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); @@ -689,7 +992,8 @@ struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, data->array = array; data->elements = elements; - return debugfs_create_file(name, mode, parent, data, &u32_array_fops); + return debugfs_create_file_unsafe(name, mode, parent, data, + &u32_array_fops); } EXPORT_SYMBOL_GPL(debugfs_create_u32_array); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index bece948b363d..4bc1f68243c1 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -27,9 +27,14 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/srcu.h> + +#include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 +DEFINE_SRCU(debugfs_srcu); + static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; @@ -39,7 +44,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = + inode->i_ctime = current_fs_time(sb); } return inode; } @@ -294,6 +300,37 @@ static struct dentry *end_creating(struct dentry *dentry) return dentry; } +static struct dentry *__debugfs_create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *proxy_fops, + const struct file_operations *real_fops) +{ + struct dentry *dentry; + struct inode *inode; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + BUG_ON(!S_ISREG(mode)); + dentry = start_creating(name, parent); + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = mode; + inode->i_private = data; + + inode->i_fop = proxy_fops; + dentry->d_fsdata = (void *)real_fops; + + d_instantiate(dentry, inode); + fsnotify_create(d_inode(dentry->d_parent), dentry); + return end_creating(dentry); +} + /** * debugfs_create_file - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. @@ -324,29 +361,52 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { - struct dentry *dentry; - struct inode *inode; - if (!(mode & S_IFMT)) - mode |= S_IFREG; - BUG_ON(!S_ISREG(mode)); - dentry = start_creating(name, parent); - - if (IS_ERR(dentry)) - return NULL; + return __debugfs_create_file(name, mode, parent, data, + fops ? &debugfs_full_proxy_file_operations : + &debugfs_noop_file_operations, + fops); +} +EXPORT_SYMBOL_GPL(debugfs_create_file); - inode = debugfs_get_inode(dentry->d_sb); - if (unlikely(!inode)) - return failed_creating(dentry); +/** + * debugfs_create_file_unsafe - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * + * debugfs_create_file_unsafe() is completely analogous to + * debugfs_create_file(), the only difference being that the fops + * handed it will not get protected against file removals by the + * debugfs core. + * + * It is your responsibility to protect your struct file_operation + * methods against file removals by means of debugfs_use_file_start() + * and debugfs_use_file_finish(). ->open() is still protected by + * debugfs though. + * + * Any struct file_operations defined by means of + * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and + * thus, may be used here. + */ +struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) +{ - inode->i_mode = mode; - inode->i_fop = fops ? fops : &debugfs_file_operations; - inode->i_private = data; - d_instantiate(dentry, inode); - fsnotify_create(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return __debugfs_create_file(name, mode, parent, data, + fops ? &debugfs_open_proxy_file_operations : + &debugfs_noop_file_operations, + fops); } -EXPORT_SYMBOL_GPL(debugfs_create_file); +EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); /** * debugfs_create_file_size - create a file in the debugfs filesystem @@ -457,11 +517,15 @@ struct dentry *debugfs_create_automount(const char *name, if (unlikely(!inode)) return failed_creating(dentry); - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; dentry->d_fsdata = (void *)f; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); d_instantiate(dentry, inode); + inc_nlink(d_inode(dentry->d_parent)); + fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); @@ -565,6 +629,8 @@ void debugfs_remove(struct dentry *dentry) inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); + + synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove); @@ -642,6 +708,8 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); inode_unlock(d_inode(parent)); + + synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h new file mode 100644 index 000000000000..bba52634b995 --- /dev/null +++ b/fs/debugfs/internal.h @@ -0,0 +1,26 @@ +/* + * internal.h - declarations internal to debugfs + * + * Copyright (C) 2016 Nicolai Stange <nicstange@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + */ + +#ifndef _DEBUGFS_INTERNAL_H_ +#define _DEBUGFS_INTERNAL_H_ + +struct file_operations; + +/* declared over in file.c */ +extern const struct file_operations debugfs_noop_file_operations; +extern const struct file_operations debugfs_open_proxy_file_operations; +extern const struct file_operations debugfs_full_proxy_file_operations; + +struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops); + +#endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 655f21f99160..0b2954d7172d 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -128,6 +128,7 @@ static const match_table_t tokens = { struct pts_fs_info { struct ida allocated_ptys; struct pts_mount_opts mount_opts; + struct super_block *sb; struct dentry *ptmx_dentry; }; @@ -358,7 +359,7 @@ static const struct super_operations devpts_sops = { .show_options = devpts_show_options, }; -static void *new_pts_fs_info(void) +static void *new_pts_fs_info(struct super_block *sb) { struct pts_fs_info *fsi; @@ -369,6 +370,7 @@ static void *new_pts_fs_info(void) ida_init(&fsi->allocated_ptys); fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->sb = sb; return fsi; } @@ -384,7 +386,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; - s->s_fs_info = new_pts_fs_info(); + s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; @@ -524,17 +526,14 @@ static struct file_system_type devpts_fs_type = { * to the System V naming convention */ -int devpts_new_index(struct inode *ptmx_inode) +int devpts_new_index(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi; int index; int ida_ret; - if (!sb) + if (!fsi) return -ENODEV; - fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -564,11 +563,8 @@ retry: return index; } -void devpts_kill_index(struct inode *ptmx_inode, int idx) +void devpts_kill_index(struct pts_fs_info *fsi, int idx) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); - mutex_lock(&allocated_ptys_lock); ida_remove(&fsi->allocated_ptys, idx); pty_count--; @@ -578,21 +574,25 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) /* * pty code needs to hold extra references in case of last /dev/tty close */ - -void devpts_add_ref(struct inode *ptmx_inode) +struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; + struct pts_fs_info *fsi; + + sb = pts_sb_from_inode(ptmx_inode); + if (!sb) + return NULL; + fsi = DEVPTS_SB(sb); + if (!fsi) + return NULL; atomic_inc(&sb->s_active); - ihold(ptmx_inode); + return fsi; } -void devpts_del_ref(struct inode *ptmx_inode) +void devpts_put_ref(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - - iput(ptmx_inode); - deactivate_super(sb); + deactivate_super(fsi->sb); } /** @@ -604,22 +604,20 @@ void devpts_del_ref(struct inode *ptmx_inode) * * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, - void *priv) +struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { struct dentry *dentry; - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; struct inode *inode; struct dentry *root; - struct pts_fs_info *fsi; struct pts_mount_opts *opts; char s[12]; - if (!sb) + if (!fsi) return ERR_PTR(-ENODEV); + sb = fsi->sb; root = sb->s_root; - fsi = DEVPTS_SB(sb); opts = &fsi->mount_opts; inode = new_inode(sb); @@ -630,25 +628,21 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - init_special_inode(inode, S_IFCHR|opts->mode, device); - inode->i_private = priv; + init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index)); sprintf(s, "%d", index); - inode_lock(d_inode(root)); - dentry = d_alloc_name(root, s); if (dentry) { + dentry->d_fsdata = priv; d_add(dentry, inode); fsnotify_create(d_inode(root), dentry); } else { iput(inode); - inode = ERR_PTR(-ENOMEM); + dentry = ERR_PTR(-ENOMEM); } - inode_unlock(d_inode(root)); - - return inode; + return dentry; } /** @@ -657,24 +651,10 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, * * Returns whatever was passed as priv in devpts_pty_new for a given inode. */ -void *devpts_get_priv(struct inode *pts_inode) +void *devpts_get_priv(struct dentry *dentry) { - struct dentry *dentry; - void *priv = NULL; - - BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - - /* Ensure dentry has not been deleted by devpts_pty_kill() */ - dentry = d_find_alias(pts_inode); - if (!dentry) - return NULL; - - if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - priv = pts_inode->i_private; - - dput(dentry); - - return priv; + WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); + return dentry->d_fsdata; } /** @@ -683,24 +663,14 @@ void *devpts_get_priv(struct inode *pts_inode) * * This is an inverse operation of devpts_pty_new. */ -void devpts_pty_kill(struct inode *inode) +void devpts_pty_kill(struct dentry *dentry) { - struct super_block *sb = pts_sb_from_inode(inode); - struct dentry *root = sb->s_root; - struct dentry *dentry; + WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); - BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - - inode_lock(d_inode(root)); - - dentry = d_find_alias(inode); - - drop_nlink(inode); + dentry->d_fsdata = NULL; + drop_nlink(dentry->d_inode); d_delete(dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ - dput(dentry); /* d_find_alias above */ - - inode_unlock(d_inode(root)); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 476f1ecbd1f0..3bf3f20f8ecc 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -172,7 +172,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) */ if (dio->page_errors == 0) dio->page_errors = ret; - page_cache_get(page); + get_page(page); dio->pages[0] = page; sdio->head = 0; sdio->tail = 1; @@ -224,9 +224,9 @@ static inline struct page *dio_get_page(struct dio *dio, * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, - bool is_async) +static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) { + loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; /* @@ -256,6 +256,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, if (dio->end_io) { int err; + // XXX: ki_pos?? err = dio->end_io(dio->iocb, offset, ret, dio->private); if (err) ret = err; @@ -265,15 +266,15 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, inode_dio_end(dio->inode); if (is_async) { - if (dio->rw & WRITE) { - int err; - - err = generic_write_sync(dio->iocb->ki_filp, offset, - transferred); - if (err < 0 && ret > 0) - ret = err; - } + /* + * generic_write_sync expects ki_pos to have been updated + * already, but the submission path only does this for + * synchronous I/O. + */ + dio->iocb->ki_pos += transferred; + if (dio->rw & WRITE) + ret = generic_write_sync(dio->iocb, transferred); dio->iocb->ki_complete(dio->iocb, ret, 0); } @@ -285,7 +286,7 @@ static void dio_aio_complete_work(struct work_struct *work) { struct dio *dio = container_of(work, struct dio, complete_work); - dio_complete(dio, dio->iocb->ki_pos, 0, true); + dio_complete(dio, 0, true); } static int dio_bio_complete(struct dio *dio, struct bio *bio); @@ -314,7 +315,7 @@ static void dio_bio_end_aio(struct bio *bio) queue_work(dio->inode->i_sb->s_dio_done_wq, &dio->complete_work); } else { - dio_complete(dio, dio->iocb->ki_pos, 0, true); + dio_complete(dio, 0, true); } } } @@ -424,7 +425,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { while (sdio->head < sdio->tail) - page_cache_release(dio->pages[sdio->head++]); + put_page(dio->pages[sdio->head++]); } /* @@ -487,7 +488,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (dio->rw == READ && !PageCompound(page) && dio->should_dirty) set_page_dirty_lock(page); - page_cache_release(page); + put_page(page); } err = bio->bi_error; bio_put(bio); @@ -696,7 +697,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio) */ if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) sdio->pages_in_io--; - page_cache_get(sdio->cur_page); + get_page(sdio->cur_page); sdio->final_block_in_bio = sdio->cur_page_block + (sdio->cur_page_len >> sdio->blkbits); ret = 0; @@ -810,13 +811,13 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, */ if (sdio->cur_page) { ret = dio_send_cur_page(dio, sdio, map_bh); - page_cache_release(sdio->cur_page); + put_page(sdio->cur_page); sdio->cur_page = NULL; if (ret) return ret; } - page_cache_get(page); /* It is in dio */ + get_page(page); /* It is in dio */ sdio->cur_page = page; sdio->cur_page_offset = offset; sdio->cur_page_len = len; @@ -830,7 +831,7 @@ out: if (sdio->boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); dio_bio_submit(dio, sdio); - page_cache_release(sdio->cur_page); + put_page(sdio->cur_page); sdio->cur_page = NULL; } return ret; @@ -947,7 +948,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, ret = get_more_blocks(dio, sdio, map_bh); if (ret) { - page_cache_release(page); + put_page(page); goto out; } if (!buffer_mapped(map_bh)) @@ -988,7 +989,7 @@ do_holes: /* AKPM: eargh, -ENOTBLK is a hack */ if (dio->rw & WRITE) { - page_cache_release(page); + put_page(page); return -ENOTBLK; } @@ -1001,7 +1002,7 @@ do_holes: if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ - page_cache_release(page); + put_page(page); goto out; } zero_user(page, from, 1 << blkbits); @@ -1041,7 +1042,7 @@ do_holes: sdio->next_block_for_io, map_bh); if (ret) { - page_cache_release(page); + put_page(page); goto out; } sdio->next_block_for_io += this_chunk_blocks; @@ -1057,7 +1058,7 @@ next_block: } /* Drop the ref which was taken in get_user_pages() */ - page_cache_release(page); + put_page(page); } out: return ret; @@ -1113,7 +1114,7 @@ static inline int drop_refcount(struct dio *dio) static inline ssize_t do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, - loff_t offset, get_block_t get_block, dio_iodone_t end_io, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); @@ -1121,6 +1122,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; @@ -1281,7 +1283,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, ret2 = dio_send_cur_page(dio, &sdio, &map_bh); if (retval == 0) retval = ret2; - page_cache_release(sdio.cur_page); + put_page(sdio.cur_page); sdio.cur_page = NULL; } if (sdio.bio) @@ -1318,7 +1320,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio_await_completion(dio); if (drop_refcount(dio) == 0) { - retval = dio_complete(dio, offset, retval, false); + retval = dio_complete(dio, retval, false); } else BUG_ON(retval != -EIOCBQUEUED); @@ -1328,7 +1330,7 @@ out: ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, - loff_t offset, get_block_t get_block, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { @@ -1344,7 +1346,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, prefetch(bdev->bd_queue); prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); - return do_blockdev_direct_IO(iocb, inode, bdev, iter, offset, get_block, + return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block, end_io, submit_io, flags); } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 519112168a9e..1669f6291c95 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -343,13 +343,12 @@ static struct config_group *make_cluster(struct config_group *g, struct dlm_cluster *cl = NULL; struct dlm_spaces *sps = NULL; struct dlm_comms *cms = NULL; - void *gps = NULL; cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); - if (!cl || !gps || !sps || !cms) + if (!cl || !sps || !cms) goto fail; config_group_init_type_name(&cl->group, name, &cluster_type); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 00640e70ed7a..1ab012a27d9f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -640,7 +640,7 @@ static int receive_from_sock(struct connection *con) con->rx_page = alloc_page(GFP_ATOMIC); if (con->rx_page == NULL) goto out_resched; - cbuf_init(&con->cb, PAGE_CACHE_SIZE); + cbuf_init(&con->cb, PAGE_SIZE); } /* @@ -657,7 +657,7 @@ static int receive_from_sock(struct connection *con) * buffer and the start of the currently used section (cb.base) */ if (cbuf_data(&con->cb) >= con->cb.base) { - iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb); + iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb); iov[1].iov_len = con->cb.base; iov[1].iov_base = page_address(con->rx_page); nvec = 2; @@ -675,7 +675,7 @@ static int receive_from_sock(struct connection *con) ret = dlm_process_incoming_buffer(con->nodeid, page_address(con->rx_page), con->cb.base, con->cb.len, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (ret == -EBADMSG) { log_print("lowcomms: addr=%p, base=%u, len=%u, read=%d", page_address(con->rx_page), con->cb.base, @@ -1416,7 +1416,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) spin_lock(&con->writequeue_lock); e = list_entry(con->writequeue.prev, struct writequeue_entry, list); if ((&e->list == &con->writequeue) || - (PAGE_CACHE_SIZE - e->end < len)) { + (PAGE_SIZE - e->end < len)) { e = NULL; } else { offset = e->end; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 64026e53722a..ebd40f46ed4c 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -105,19 +105,7 @@ static int ecryptfs_calculate_md5(char *dst, struct crypto_shash *tfm; int rc = 0; - mutex_lock(&crypt_stat->cs_hash_tfm_mutex); tfm = crypt_stat->hash_tfm; - if (!tfm) { - tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0); - if (IS_ERR(tfm)) { - rc = PTR_ERR(tfm); - ecryptfs_printk(KERN_ERR, "Error attempting to " - "allocate crypto context; rc = [%d]\n", - rc); - goto out; - } - crypt_stat->hash_tfm = tfm; - } rc = ecryptfs_hash_digest(tfm, src, len, dst); if (rc) { printk(KERN_ERR @@ -126,7 +114,6 @@ static int ecryptfs_calculate_md5(char *dst, goto out; } out: - mutex_unlock(&crypt_stat->cs_hash_tfm_mutex); return rc; } @@ -207,16 +194,29 @@ out: * * Initialize the crypt_stat structure. */ -void -ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) { + struct crypto_shash *tfm; + int rc; + + tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0); + if (IS_ERR(tfm)) { + rc = PTR_ERR(tfm); + ecryptfs_printk(KERN_ERR, "Error attempting to " + "allocate crypto context; rc = [%d]\n", + rc); + return rc; + } + memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); INIT_LIST_HEAD(&crypt_stat->keysig_list); mutex_init(&crypt_stat->keysig_list_mutex); mutex_init(&crypt_stat->cs_mutex); mutex_init(&crypt_stat->cs_tfm_mutex); - mutex_init(&crypt_stat->cs_hash_tfm_mutex); + crypt_stat->hash_tfm = tfm; crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED; + + return 0; } /** @@ -286,7 +286,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, pg = virt_to_page(addr); offset = offset_in_page(addr); sg_set_page(&sg[i], pg, 0, offset); - remainder_of_page = PAGE_CACHE_SIZE - offset; + remainder_of_page = PAGE_SIZE - offset; if (size >= remainder_of_page) { sg[i].length = remainder_of_page; addr += remainder_of_page; @@ -400,7 +400,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, struct page *page) { return ecryptfs_lower_header_size(crypt_stat) + - ((loff_t)page->index << PAGE_CACHE_SHIFT); + ((loff_t)page->index << PAGE_SHIFT); } /** @@ -428,7 +428,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat, size_t extent_size = crypt_stat->extent_size; int rc; - extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size)); + extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size)); rc = ecryptfs_derive_iv(extent_iv, crypt_stat, (extent_base + extent_offset)); if (rc) { @@ -498,7 +498,7 @@ int ecryptfs_encrypt_page(struct page *page) } for (extent_offset = 0; - extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); + extent_offset < (PAGE_SIZE / crypt_stat->extent_size); extent_offset++) { rc = crypt_extent(crypt_stat, enc_extent_page, page, extent_offset, ENCRYPT); @@ -512,7 +512,7 @@ int ecryptfs_encrypt_page(struct page *page) lower_offset = lower_offset_for_page(crypt_stat, page); enc_extent_virt = kmap(enc_extent_page); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, - PAGE_CACHE_SIZE); + PAGE_SIZE); kunmap(enc_extent_page); if (rc < 0) { ecryptfs_printk(KERN_ERR, @@ -560,7 +560,7 @@ int ecryptfs_decrypt_page(struct page *page) lower_offset = lower_offset_for_page(crypt_stat, page); page_virt = kmap(page); - rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE, + rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE, ecryptfs_inode); kunmap(page); if (rc < 0) { @@ -571,7 +571,7 @@ int ecryptfs_decrypt_page(struct page *page) } for (extent_offset = 0; - extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); + extent_offset < (PAGE_SIZE / crypt_stat->extent_size); extent_offset++) { rc = crypt_extent(crypt_stat, page, page, extent_offset, DECRYPT); @@ -659,11 +659,11 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; else { - if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) + if (PAGE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; else - crypt_stat->metadata_size = PAGE_CACHE_SIZE; + crypt_stat->metadata_size = PAGE_SIZE; } } @@ -1369,7 +1369,9 @@ int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode) ssize_t size; int rc = 0; - size = ecryptfs_getxattr_lower(lower_dentry, ECRYPTFS_XATTR_NAME, + size = ecryptfs_getxattr_lower(lower_dentry, + ecryptfs_inode_to_lower(ecryptfs_inode), + ECRYPTFS_XATTR_NAME, page_virt, ECRYPTFS_DEFAULT_EXTENT_SIZE); if (size < 0) { if (unlikely(ecryptfs_verbosity > 0)) @@ -1391,6 +1393,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, int rc; rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), + ecryptfs_inode_to_lower(inode), ECRYPTFS_XATTR_NAME, file_size, ECRYPTFS_SIZE_AND_MARKER_BYTES); if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) @@ -1442,7 +1445,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) ECRYPTFS_VALIDATE_HEADER_SIZE); if (rc) { /* metadata is not in the file header, so try xattrs */ - memset(page_virt, 0, PAGE_CACHE_SIZE); + memset(page_virt, 0, PAGE_SIZE); rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " @@ -1475,7 +1478,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) } out: if (page_virt) { - memset(page_virt, 0, PAGE_CACHE_SIZE); + memset(page_virt, 0, PAGE_SIZE); kmem_cache_free(ecryptfs_header_cache, page_virt); } return rc; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index d123fbaa28e0..3ec495db7e82 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -242,7 +242,6 @@ struct ecryptfs_crypt_stat { struct list_head keysig_list; struct mutex keysig_list_mutex; struct mutex cs_tfm_mutex; - struct mutex cs_hash_tfm_mutex; struct mutex cs_mutex; }; @@ -577,7 +576,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_rotate_iv(unsigned char *iv); -void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat); @@ -607,8 +606,8 @@ ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, unsigned char *src, struct dentry *ecryptfs_dentry); int ecryptfs_truncate(struct dentry *dentry, loff_t new_length); ssize_t -ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, - void *value, size_t size); +ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode, + const char *name, void *value, size_t size); int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7..7000b96b783e 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -223,14 +222,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -247,6 +238,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -267,6 +297,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -340,26 +383,22 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #endif const struct file_operations ecryptfs_dir_fops = { - .iterate = ecryptfs_readdir, + .iterate_shared = ecryptfs_readdir, .read = generic_read_dir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 121114e9a464..318b04689d76 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -324,9 +324,8 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) /** * ecryptfs_lookup_interpose - Dentry interposition for a lookup */ -static int ecryptfs_lookup_interpose(struct dentry *dentry, - struct dentry *lower_dentry, - struct inode *dir_inode) +static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, + struct dentry *lower_dentry) { struct inode *inode, *lower_inode = d_inode(lower_dentry); struct ecryptfs_dentry_info *dentry_info; @@ -339,11 +338,12 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, "to allocate ecryptfs_dentry_info struct\n", __func__); dput(lower_dentry); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - fsstack_copy_attr_atime(dir_inode, d_inode(lower_dentry->d_parent)); + fsstack_copy_attr_atime(d_inode(dentry->d_parent), + d_inode(lower_dentry->d_parent)); BUG_ON(!d_count(lower_dentry)); ecryptfs_set_dentry_private(dentry, dentry_info); @@ -353,27 +353,25 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, if (d_really_is_negative(lower_dentry)) { /* We want to add because we couldn't find in lower */ d_add(dentry, NULL); - return 0; + return NULL; } - inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb); + inode = __ecryptfs_get_inode(lower_inode, dentry->d_sb); if (IS_ERR(inode)) { printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n", __func__, PTR_ERR(inode)); - return PTR_ERR(inode); + return ERR_CAST(inode); } if (S_ISREG(inode->i_mode)) { rc = ecryptfs_i_size_read(dentry, inode); if (rc) { make_bad_inode(inode); - return rc; + return ERR_PTR(rc); } } if (inode->i_state & I_NEW) unlock_new_inode(inode); - d_add(dentry, inode); - - return rc; + return d_splice_alias(inode, dentry); } /** @@ -390,55 +388,42 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, unsigned int flags) { char *encrypted_and_encoded_name = NULL; - size_t encrypted_and_encoded_name_size; - struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct dentry *lower_dir_dentry, *lower_dentry; + const char *name = ecryptfs_dentry->d_name.name; + size_t len = ecryptfs_dentry->d_name.len; + struct dentry *res; int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - lower_dentry = lookup_one_len_unlocked(ecryptfs_dentry->d_name.name, - lower_dir_dentry, - ecryptfs_dentry->d_name.len); - if (IS_ERR(lower_dentry)) { - rc = PTR_ERR(lower_dentry); - ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " - "[%d] on lower_dentry = [%pd]\n", __func__, rc, - ecryptfs_dentry); - goto out; - } - if (d_really_is_positive(lower_dentry)) - goto interpose; + mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; - if (!(mount_crypt_stat - && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) - goto interpose; - dput(lower_dentry); - rc = ecryptfs_encrypt_and_encode_filename( - &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, - mount_crypt_stat, ecryptfs_dentry->d_name.name, - ecryptfs_dentry->d_name.len); - if (rc) { - printk(KERN_ERR "%s: Error attempting to encrypt and encode " - "filename; rc = [%d]\n", __func__, rc); - goto out; + if (mount_crypt_stat + && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) { + rc = ecryptfs_encrypt_and_encode_filename( + &encrypted_and_encoded_name, &len, + mount_crypt_stat, name, len); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt and encode " + "filename; rc = [%d]\n", __func__, rc); + return ERR_PTR(rc); + } + name = encrypted_and_encoded_name; } - lower_dentry = lookup_one_len_unlocked(encrypted_and_encoded_name, - lower_dir_dentry, - encrypted_and_encoded_name_size); + + lower_dentry = lookup_one_len_unlocked(name, lower_dir_dentry, len); if (IS_ERR(lower_dentry)) { - rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " - "[%d] on lower_dentry = [%s]\n", __func__, rc, - encrypted_and_encoded_name); - goto out; + "[%ld] on lower_dentry = [%s]\n", __func__, + PTR_ERR(lower_dentry), + name); + res = ERR_CAST(lower_dentry); + } else { + res = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry); } -interpose: - rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, - ecryptfs_dir_inode); -out: kfree(encrypted_and_encoded_name); - return ERR_PTR(rc); + return res; } static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, @@ -763,10 +748,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, } else { /* ia->ia_size < i_size_read(inode) */ /* We're chopping off all the pages down to the page * in which ia->ia_size is located. Fill in the end of - * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to - * PAGE_CACHE_SIZE with zeros. */ - size_t num_zeros = (PAGE_CACHE_SIZE - - (ia->ia_size & ~PAGE_CACHE_MASK)); + * that page from (ia->ia_size & ~PAGE_MASK) to + * PAGE_SIZE with zeros. */ + size_t num_zeros = (PAGE_SIZE + - (ia->ia_size & ~PAGE_MASK)); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { truncate_setsize(inode, ia->ia_size); @@ -898,8 +883,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) struct ecryptfs_crypt_stat *crypt_stat; crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; - if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) - ecryptfs_init_crypt_stat(crypt_stat); + if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) { + rc = ecryptfs_init_crypt_stat(crypt_stat); + if (rc) + return rc; + } inode = d_inode(dentry); lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); @@ -1033,29 +1021,30 @@ out: } ssize_t -ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, - void *value, size_t size) +ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode, + const char *name, void *value, size_t size) { int rc = 0; - if (!d_inode(lower_dentry)->i_op->getxattr) { + if (!lower_inode->i_op->getxattr) { rc = -EOPNOTSUPP; goto out; } - inode_lock(d_inode(lower_dentry)); - rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, - size); - inode_unlock(d_inode(lower_dentry)); + inode_lock(lower_inode); + rc = lower_inode->i_op->getxattr(lower_dentry, lower_inode, + name, value, size); + inode_unlock(lower_inode); out: return rc; } static ssize_t -ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value, - size_t size) +ecryptfs_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) { - return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), name, - value, size); + return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), + ecryptfs_inode_to_lower(inode), + name, value, size); } static ssize_t diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 9893d1538122..3cf1546dca82 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1798,7 +1798,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, * added the our &auth_tok_list */ next_packet_is_auth_tok_packet = 1; while (next_packet_is_auth_tok_packet) { - size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i); + size_t max_packet_size = ((PAGE_SIZE - 8) - i); switch (src[i]) { case ECRYPTFS_TAG_3_PACKET_TYPE: diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 8b0b4a73116d..1698132d0e57 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -695,12 +695,12 @@ static struct ecryptfs_cache_info { { .cache = &ecryptfs_header_cache, .name = "ecryptfs_headers", - .size = PAGE_CACHE_SIZE, + .size = PAGE_SIZE, }, { .cache = &ecryptfs_xattr_cache, .name = "ecryptfs_xattr_cache", - .size = PAGE_CACHE_SIZE, + .size = PAGE_SIZE, }, { .cache = &ecryptfs_key_record_cache, @@ -818,7 +818,7 @@ static int __init ecryptfs_init(void) { int rc; - if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) { + if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_SIZE) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " "larger than the host's page size, and so " @@ -826,7 +826,7 @@ static int __init ecryptfs_init(void) "default eCryptfs extent size is [%u] bytes; " "the page size is [%lu] bytes.\n", ECRYPTFS_DEFAULT_EXTENT_SIZE, - (unsigned long)PAGE_CACHE_SIZE); + (unsigned long)PAGE_SIZE); goto out; } rc = ecryptfs_init_kmem_caches(); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 1f5865263b3e..148d11b514fb 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -122,7 +122,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, struct ecryptfs_crypt_stat *crypt_stat) { loff_t extent_num_in_page = 0; - loff_t num_extents_per_page = (PAGE_CACHE_SIZE + loff_t num_extents_per_page = (PAGE_SIZE / crypt_stat->extent_size); int rc = 0; @@ -138,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, char *page_virt; page_virt = kmap_atomic(page); - memset(page_virt, 0, PAGE_CACHE_SIZE); + memset(page_virt, 0, PAGE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { size_t written; @@ -164,8 +164,8 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, - crypt_stat->metadata_size); rc = ecryptfs_read_lower_page_segment( - page, (lower_offset >> PAGE_CACHE_SHIFT), - (lower_offset & ~PAGE_CACHE_MASK), + page, (lower_offset >> PAGE_SHIFT), + (lower_offset & ~PAGE_MASK), crypt_stat->extent_size, page->mapping->host); if (rc) { printk(KERN_ERR "%s: Error attempting to read " @@ -198,7 +198,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page) if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_read_lower_page_segment(page, page->index, 0, - PAGE_CACHE_SIZE, + PAGE_SIZE, page->mapping->host); } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { @@ -215,7 +215,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page) } else { rc = ecryptfs_read_lower_page_segment( - page, page->index, 0, PAGE_CACHE_SIZE, + page, page->index, 0, PAGE_SIZE, page->mapping->host); if (rc) { printk(KERN_ERR "Error reading page; rc = " @@ -250,12 +250,12 @@ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) struct inode *inode = page->mapping->host; int end_byte_in_page; - if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index) + if ((i_size_read(inode) / PAGE_SIZE) != page->index) goto out; - end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; + end_byte_in_page = i_size_read(inode) % PAGE_SIZE; if (to > end_byte_in_page) end_byte_in_page = to; - zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE); + zero_user_segment(page, end_byte_in_page, PAGE_SIZE); out: return 0; } @@ -279,7 +279,7 @@ static int ecryptfs_write_begin(struct file *file, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; struct page *page; loff_t prev_page_end_size; int rc = 0; @@ -289,14 +289,14 @@ static int ecryptfs_write_begin(struct file *file, return -ENOMEM; *pagep = page; - prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); + prev_page_end_size = ((loff_t)index << PAGE_SHIFT); if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(mapping->host)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_read_lower_page_segment( - page, index, 0, PAGE_CACHE_SIZE, mapping->host); + page, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error attempting to read " "lower page segment; rc = [%d]\n", @@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file, SetPageUptodate(page); } else { rc = ecryptfs_read_lower_page_segment( - page, index, 0, PAGE_CACHE_SIZE, + page, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error reading " @@ -336,9 +336,9 @@ static int ecryptfs_write_begin(struct file *file, } else { if (prev_page_end_size >= i_size_read(page->mapping->host)) { - zero_user(page, 0, PAGE_CACHE_SIZE); + zero_user(page, 0, PAGE_SIZE); SetPageUptodate(page); - } else if (len < PAGE_CACHE_SIZE) { + } else if (len < PAGE_SIZE) { rc = ecryptfs_decrypt_page(page); if (rc) { printk(KERN_ERR "%s: Error decrypting " @@ -371,11 +371,11 @@ static int ecryptfs_write_begin(struct file *file, * of page? Zero it out. */ if ((i_size_read(mapping->host) == prev_page_end_size) && (pos != 0)) - zero_user(page, 0, PAGE_CACHE_SIZE); + zero_user(page, 0, PAGE_SIZE); out: if (unlikely(rc)) { unlock_page(page); - page_cache_release(page); + put_page(page); *pagep = NULL; } return rc; @@ -436,8 +436,9 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) goto out; } inode_lock(lower_inode); - size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, - xattr_virt, PAGE_CACHE_SIZE); + size = lower_inode->i_op->getxattr(lower_dentry, lower_inode, + ECRYPTFS_XATTR_NAME, + xattr_virt, PAGE_SIZE); if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); @@ -479,8 +480,8 @@ static int ecryptfs_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + pgoff_t index = pos >> PAGE_SHIFT; + unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + copied; struct inode *ecryptfs_inode = mapping->host; struct ecryptfs_crypt_stat *crypt_stat = @@ -500,7 +501,7 @@ static int ecryptfs_write_end(struct file *file, goto out; } if (!PageUptodate(page)) { - if (copied < PAGE_CACHE_SIZE) { + if (copied < PAGE_SIZE) { rc = 0; goto out; } @@ -533,7 +534,7 @@ static int ecryptfs_write_end(struct file *file, rc = copied; out: unlock_page(page); - page_cache_release(page); + put_page(page); return rc; } diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 09fe622274e4..158a3a39f82d 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -74,7 +74,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, loff_t offset; int rc; - offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT) + offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT) + offset_in_page); virt = kmap(page_for_lower); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); @@ -123,9 +123,9 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, else pos = offset; while (pos < (offset + size)) { - pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); - size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); - size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); + pgoff_t ecryptfs_page_idx = (pos >> PAGE_SHIFT); + size_t start_offset_in_page = (pos & ~PAGE_MASK); + size_t num_bytes = (PAGE_SIZE - start_offset_in_page); loff_t total_remaining_bytes = ((offset + size) - pos); if (fatal_signal_pending(current)) { @@ -165,7 +165,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, * Fill in zero values to the end of the page */ memset(((char *)ecryptfs_page_virt + start_offset_in_page), 0, - PAGE_CACHE_SIZE - start_offset_in_page); + PAGE_SIZE - start_offset_in_page); } /* pos >= offset, we are now writing the data request */ @@ -186,7 +186,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, ecryptfs_page, start_offset_in_page, data_offset); - page_cache_release(ecryptfs_page); + put_page(ecryptfs_page); if (rc) { printk(KERN_ERR "%s: Error encrypting " "page; rc = [%d]\n", __func__, rc); @@ -262,7 +262,7 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, loff_t offset; int rc; - offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page); + offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page); virt = kmap(page_for_ecryptfs); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); if (rc > 0) diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 77a486d3a51b..85411ceb0508 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -55,7 +55,10 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb) inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL); if (unlikely(!inode_info)) goto out; - ecryptfs_init_crypt_stat(&inode_info->crypt_stat); + if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) { + kmem_cache_free(ecryptfs_inode_info_cache, inode_info); + goto out; + } mutex_init(&inode_info->lower_file_mutex); atomic_set(&inode_info->lower_file_count, 0); inode_info->lower_file = NULL; diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index d48e0d261d78..5f22e74bbade 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -157,7 +157,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg) return 0; } -long +static long efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p) { void __user *arg = (void __user *)p; diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index e2ab6d0497f2..1d73fc6dba13 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/uuid.h> #include "internal.h" @@ -46,11 +47,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, */ bool efivarfs_valid_name(const char *str, int len) { - static const char dashes[EFI_VARIABLE_GUID_LEN] = { - [8] = 1, [13] = 1, [18] = 1, [23] = 1 - }; const char *s = str + len - EFI_VARIABLE_GUID_LEN; - int i; /* * We need a GUID, plus at least one letter for the variable name, @@ -68,37 +65,7 @@ bool efivarfs_valid_name(const char *str, int len) * * 12345678-1234-1234-1234-123456789abc */ - for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) { - if (dashes[i]) { - if (*s++ != '-') - return false; - } else { - if (!isxdigit(*s++)) - return false; - } - } - - return true; -} - -static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid) -{ - guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]); - guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]); - guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]); - guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]); - guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]); - guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]); - guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]); - guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]); - guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]); - guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]); - guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]); - guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]); - guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]); - guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]); - guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]); - guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]); + return uuid_is_valid(s); } static int efivarfs_create(struct inode *dir, struct dentry *dentry, @@ -119,8 +86,7 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, /* length of the variable name itself: remove GUID and separator */ namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; - efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, - &var->var.VendorGuid); + uuid_le_to_bin(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); if (efivar_variable_is_removable(var->var.VendorGuid, dentry->d_name.name, namelen)) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index dd029d13ea61..9cb54a38832d 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -197,8 +197,8 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) efivarfs_sb = sb; sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = EFIVARFS_MAGIC; sb->s_op = &efivarfs_ops; sb->s_d_op = &efivarfs_d_ops; @@ -216,8 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&efivarfs_list); - err = efivar_init(efivarfs_callback, (void *)sb, false, - true, &efivarfs_list); + err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list); if (err) __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); diff --git a/fs/efs/dir.c b/fs/efs/dir.c index ce63b24f7c3e..a7be96e5f1cb 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -12,7 +12,7 @@ static int efs_readdir(struct file *, struct dir_context *); const struct file_operations efs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = efs_readdir, + .iterate_shared = efs_readdir, }; const struct inode_operations efs_dir_inode_operations = { @@ -100,4 +100,3 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot; return 0; } - diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 40ba9cc41bf7..d34a40edcdb2 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -113,7 +113,7 @@ struct dentry *efs_get_parent(struct dentry *child) ino = efs_find_entry(d_inode(child), "..", 2); if (ino) - parent = d_obtain_alias(efs_iget(d_inode(child)->i_sb, ino)); + parent = d_obtain_alias(efs_iget(child->d_sb, ino)); return parent; } diff --git a/fs/efs/super.c b/fs/efs/super.c index cb68dac4f9d3..368f7dd21c61 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -275,7 +275,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!bh) { pr_err("cannot read volume header\n"); - return -EINVAL; + return -EIO; } /* @@ -293,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { pr_err("cannot read superblock\n"); - return -EINVAL; + return -EIO; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8a74a2a52e0f..10db91218933 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep, return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); } -static inline struct timespec ep_set_mstimeout(long ms) +static inline struct timespec64 ep_set_mstimeout(long ms) { - struct timespec now, ts = { + struct timespec64 now, ts = { .tv_sec = ms / MSEC_PER_SEC, .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), }; - ktime_get_ts(&now); - return timespec_add_safe(now, ts); + ktime_get_ts64(&now); + return timespec64_add_safe(now, ts); } /** @@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, ktime_t expires, *to = NULL; if (timeout > 0) { - struct timespec end_time = ep_set_mstimeout(timeout); + struct timespec64 end_time = ep_set_mstimeout(timeout); slack = select_estimate_accuracy(&end_time); to = &expires; - *to = timespec_to_ktime(end_time); + *to = timespec64_to_ktime(end_time); } else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the diff --git a/fs/exec.c b/fs/exec.c index c4010b8207a1..887c1c955df8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -243,10 +243,6 @@ static void put_arg_page(struct page *page) put_page(page); } -static void free_arg_page(struct linux_binprm *bprm, int i) -{ -} - static void free_arg_pages(struct linux_binprm *bprm) { } @@ -267,7 +263,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm) if (!vma) return -ENOMEM; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) { + err = -EINTR; + goto err_free; + } vma->vm_mm = mm; /* @@ -294,6 +293,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) return 0; err: up_write(&mm->mmap_sem); +err_free: bprm->vma = NULL; kmem_cache_free(vm_area_cachep, vma); return err; @@ -700,7 +700,9 @@ int setup_arg_pages(struct linux_binprm *bprm, bprm->loader -= stack_shift; bprm->exec -= stack_shift; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + vm_flags = VM_STACK_FLAGS; /* @@ -850,15 +852,25 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, if (ret) return ret; + ret = deny_write_access(file); + if (ret) + return ret; + i_size = i_size_read(file_inode(file)); - if (max_size > 0 && i_size > max_size) - return -EFBIG; - if (i_size <= 0) - return -EINVAL; + if (max_size > 0 && i_size > max_size) { + ret = -EFBIG; + goto out; + } + if (i_size <= 0) { + ret = -EINVAL; + goto out; + } *buf = vmalloc(i_size); - if (!*buf) - return -ENOMEM; + if (!*buf) { + ret = -ENOMEM; + goto out; + } pos = 0; while (pos < i_size) { @@ -876,18 +888,21 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, if (pos != i_size) { ret = -EIO; - goto out; + goto out_free; } ret = security_kernel_post_read_file(file, *buf, i_size, id); if (!ret) *size = pos; -out: +out_free: if (ret < 0) { vfree(*buf); *buf = NULL; } + +out: + allow_write_access(file); return ret; } EXPORT_SYMBOL_GPL(kernel_read_file); @@ -1387,7 +1402,12 @@ static void bprm_fill_uid(struct linux_binprm *bprm) kuid_t uid; kgid_t gid; - /* clear any previous set[ug]id data from a previous binary */ + /* + * Since this can be called multiple times (via prepare_binprm), + * we must clear any previous work done when setting set[ug]id + * bits from any earlier bprm->file uses (for example when run + * first for a setuid script then again for its interpreter). + */ bprm->cred->euid = current_euid(); bprm->cred->egid = current_egid(); @@ -1481,9 +1501,6 @@ int remove_arg_zero(struct linux_binprm *bprm) kunmap_atomic(kaddr); put_arg_page(page); - - if (offset == PAGE_SIZE) - free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1); } while (offset == PAGE_SIZE); bprm->p++; diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index e5bb2abf77f9..f69a1b5826a5 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -41,16 +41,16 @@ static inline unsigned exofs_chunk_size(struct inode *inode) static inline void exofs_put_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) { loff_t last_byte = inode->i_size; - last_byte -= page_nr << PAGE_CACHE_SHIFT; - if (last_byte > PAGE_CACHE_SIZE) - last_byte = PAGE_CACHE_SIZE; + last_byte -= page_nr << PAGE_SHIFT; + if (last_byte > PAGE_SIZE) + last_byte = PAGE_SIZE; return last_byte; } @@ -79,19 +79,19 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) return err; } -static void exofs_check_page(struct page *page) +static bool exofs_check_page(struct page *page) { struct inode *dir = page->mapping->host; unsigned chunk_size = exofs_chunk_size(dir); char *kaddr = page_address(page); unsigned offs, rec_len; - unsigned limit = PAGE_CACHE_SIZE; + unsigned limit = PAGE_SIZE; struct exofs_dir_entry *p; char *error; /* if the page is the last one in the directory */ - if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_CACHE_MASK; + if ((dir->i_size >> PAGE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_MASK; if (limit & (chunk_size - 1)) goto Ebadsize; if (!limit) @@ -114,7 +114,7 @@ static void exofs_check_page(struct page *page) goto Eend; out: SetPageChecked(page); - return; + return true; Ebadsize: EXOFS_ERR("ERROR [exofs_check_page]: " @@ -138,7 +138,7 @@ bad_entry: EXOFS_ERR( "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n", - dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, _LLU(le64_to_cpu(p->inode_no)), rec_len, p->name_len); goto fail; @@ -147,11 +147,11 @@ Eend: EXOFS_ERR("ERROR [exofs_check_page]: " "entry in directory(0x%lx) spans the page boundary" "offset=%lu, inode=0x%llx\n", - dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + dir->i_ino, (page->index<<PAGE_SHIFT)+offs, _LLU(le64_to_cpu(p->inode_no))); fail: - SetPageChecked(page); SetPageError(page); + return false; } static struct page *exofs_get_page(struct inode *dir, unsigned long n) @@ -161,10 +161,10 @@ static struct page *exofs_get_page(struct inode *dir, unsigned long n) if (!IS_ERR(page)) { kmap(page); - if (!PageChecked(page)) - exofs_check_page(page); - if (PageError(page)) - goto fail; + if (unlikely(!PageChecked(page))) { + if (PageError(page) || !exofs_check_page(page)) + goto fail; + } } return page; @@ -237,8 +237,8 @@ exofs_readdir(struct file *file, struct dir_context *ctx) { loff_t pos = ctx->pos; struct inode *inode = file_inode(file); - unsigned int offset = pos & ~PAGE_CACHE_MASK; - unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned int offset = pos & ~PAGE_MASK; + unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); int need_revalidate = (file->f_version != inode->i_version); @@ -254,7 +254,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx) if (IS_ERR(page)) { EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", inode->i_ino); - ctx->pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_SIZE - offset; return PTR_ERR(page); } kaddr = page_address(page); @@ -262,7 +262,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx) if (offset) { offset = exofs_validate_entry(kaddr, offset, chunk_mask); - ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_SHIFT) + offset; } file->f_version = inode->i_version; need_revalidate = 0; @@ -449,7 +449,7 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode) kaddr = page_address(page); dir_end = kaddr + exofs_last_byte(dir, n); de = (struct exofs_dir_entry *)kaddr; - kaddr += PAGE_CACHE_SIZE - reclen; + kaddr += PAGE_SIZE - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { name_len = 0; @@ -602,7 +602,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) kunmap_atomic(kaddr); err = exofs_commit_chunk(page, 0, chunk_size); fail: - page_cache_release(page); + put_page(page); return err; } @@ -657,5 +657,5 @@ not_empty: const struct file_operations exofs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = exofs_readdir, + .iterate_shared = exofs_readdir, }; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 9eaf595aeaf8..9dc4c6dbf3c9 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -317,7 +317,7 @@ static int read_exec(struct page_collect *pcol) if (!pcol->ios) { int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, - pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->pg_first << PAGE_SHIFT, pcol->length, &pcol->ios); if (ret) @@ -383,7 +383,7 @@ static int readpage_strip(void *data, struct page *page) struct inode *inode = pcol->inode; struct exofs_i_info *oi = exofs_i(inode); loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + pgoff_t end_index = i_size >> PAGE_SHIFT; size_t len; int ret; @@ -397,9 +397,9 @@ static int readpage_strip(void *data, struct page *page) pcol->that_locked_page = page; if (page->index < end_index) - len = PAGE_CACHE_SIZE; + len = PAGE_SIZE; else if (page->index == end_index) - len = i_size & ~PAGE_CACHE_MASK; + len = i_size & ~PAGE_MASK; else len = 0; @@ -442,8 +442,8 @@ try_again: goto fail; } - if (len != PAGE_CACHE_SIZE) - zero_user(page, len, PAGE_CACHE_SIZE - len); + if (len != PAGE_SIZE) + zero_user(page, len, PAGE_SIZE - len); EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", inode->i_ino, page->index, len); @@ -609,7 +609,7 @@ static void __r4w_put_page(void *priv, struct page *page) if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { EXOFS_DBGMSG2("index=0x%lx\n", page->index); - page_cache_release(page); + put_page(page); return; } EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", @@ -633,7 +633,7 @@ static int write_exec(struct page_collect *pcol) BUG_ON(pcol->ios); ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, - pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->pg_first << PAGE_SHIFT, pcol->length, &pcol->ios); if (unlikely(ret)) goto err; @@ -696,7 +696,7 @@ static int writepage_strip(struct page *page, struct inode *inode = pcol->inode; struct exofs_i_info *oi = exofs_i(inode); loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + pgoff_t end_index = i_size >> PAGE_SHIFT; size_t len; int ret; @@ -708,9 +708,9 @@ static int writepage_strip(struct page *page, if (page->index < end_index) /* in this case, the page is within the limits of the file */ - len = PAGE_CACHE_SIZE; + len = PAGE_SIZE; else { - len = i_size & ~PAGE_CACHE_MASK; + len = i_size & ~PAGE_MASK; if (page->index > end_index || !len) { /* in this case, the page is outside the limits @@ -790,10 +790,10 @@ static int exofs_writepages(struct address_space *mapping, long start, end, expected_pages; int ret; - start = wbc->range_start >> PAGE_CACHE_SHIFT; + start = wbc->range_start >> PAGE_SHIFT; end = (wbc->range_end == LLONG_MAX) ? start + mapping->nrpages : - wbc->range_end >> PAGE_CACHE_SHIFT; + wbc->range_end >> PAGE_SHIFT; if (start || end) expected_pages = end - start + 1; @@ -881,15 +881,15 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, } /* read modify write */ - if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + if (!PageUptodate(page) && (len != PAGE_SIZE)) { loff_t i_size = i_size_read(mapping->host); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + pgoff_t end_index = i_size >> PAGE_SHIFT; size_t rlen; if (page->index < end_index) - rlen = PAGE_CACHE_SIZE; + rlen = PAGE_SIZE; else if (page->index == end_index) - rlen = i_size & ~PAGE_CACHE_MASK; + rlen = i_size & ~PAGE_MASK; else rlen = 0; @@ -960,8 +960,7 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, /* TODO: Should be easy enough to do proprly */ -static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { return 0; } diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index c20d77df2679..622a686bb08b 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -292,11 +292,11 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, out_dir: if (dir_de) { kunmap(dir_page); - page_cache_release(dir_page); + put_page(dir_page); } out_old: kunmap(old_page); - page_cache_release(old_page); + put_page(old_page); out: return err; } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 6658a50530a0..1076a4233b39 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -122,7 +122,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts) if (match_int(&args[0], &option)) return -EINVAL; if (option <= 0) { - EXOFS_ERR("Timout must be > 0"); + EXOFS_ERR("Timeout must be > 0"); return -EINVAL; } opts->timeout = option * HZ; @@ -958,7 +958,7 @@ static struct dentry *exofs_get_parent(struct dentry *child) if (!ino) return ERR_PTR(-ESTALE); - return d_obtain_alias(exofs_iget(d_inode(child)->i_sb, ino)); + return d_obtain_alias(exofs_iget(child->d_sb, ino)); } static struct inode *exofs_nfs_get_inode(struct super_block *sb, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index c46f1a190b8d..207ba8d627ca 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -143,14 +143,18 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - inode_lock(parent->d_inode); - tmp = lookup_one_len(nbuf, parent, strlen(nbuf)); - inode_unlock(parent->d_inode); + tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); goto out_err; } if (tmp != dentry) { + /* + * Somebody has renamed it since exportfs_get_name(); + * great, since it could've only been renamed if it + * got looked up and thus connected, and it would + * remain connected afterwards. We are done. + */ dput(tmp); goto out_reconnected; } @@ -308,7 +312,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child) goto out; error = -EINVAL; - if (!file->f_op->iterate) + if (!file->f_op->iterate && !file->f_op->iterate_shared) goto out_close; buffer.sequence = 0; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 27695e6f4e46..42f1d1814083 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -172,9 +172,6 @@ ext2_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); kfree(value); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - return acl; } diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 0c6638b40f21..19efd1197fa5 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -37,7 +37,7 @@ static inline unsigned ext2_rec_len_from_disk(__le16 dlen) { unsigned len = le16_to_cpu(dlen); -#if (PAGE_CACHE_SIZE >= 65536) +#if (PAGE_SIZE >= 65536) if (len == EXT2_MAX_REC_LEN) return 1 << 16; #endif @@ -46,7 +46,7 @@ static inline unsigned ext2_rec_len_from_disk(__le16 dlen) static inline __le16 ext2_rec_len_to_disk(unsigned len) { -#if (PAGE_CACHE_SIZE >= 65536) +#if (PAGE_SIZE >= 65536) if (len == (1 << 16)) return cpu_to_le16(EXT2_MAX_REC_LEN); else @@ -67,7 +67,7 @@ static inline unsigned ext2_chunk_size(struct inode *inode) static inline void ext2_put_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } /* @@ -79,9 +79,9 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr) { unsigned last_byte = inode->i_size; - last_byte -= page_nr << PAGE_CACHE_SHIFT; - if (last_byte > PAGE_CACHE_SIZE) - last_byte = PAGE_CACHE_SIZE; + last_byte -= page_nr << PAGE_SHIFT; + if (last_byte > PAGE_SIZE) + last_byte = PAGE_SIZE; return last_byte; } @@ -110,7 +110,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) return err; } -static void ext2_check_page(struct page *page, int quiet) +static bool ext2_check_page(struct page *page, int quiet) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; @@ -118,12 +118,12 @@ static void ext2_check_page(struct page *page, int quiet) char *kaddr = page_address(page); u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count); unsigned offs, rec_len; - unsigned limit = PAGE_CACHE_SIZE; + unsigned limit = PAGE_SIZE; ext2_dirent *p; char *error; - if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_CACHE_MASK; + if ((dir->i_size >> PAGE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_MASK; if (limit & (chunk_size - 1)) goto Ebadsize; if (!limit) @@ -148,7 +148,7 @@ static void ext2_check_page(struct page *page, int quiet) goto Eend; out: SetPageChecked(page); - return; + return true; /* Too bad, we had an error */ @@ -176,7 +176,7 @@ bad_entry: if (!quiet) ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, (unsigned long) le32_to_cpu(p->inode), rec_len, p->name_len); goto fail; @@ -186,12 +186,12 @@ Eend: ext2_error(sb, "ext2_check_page", "entry in directory #%lu spans the page boundary" "offset=%lu, inode=%lu", - dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + dir->i_ino, (page->index<<PAGE_SHIFT)+offs, (unsigned long) le32_to_cpu(p->inode)); } fail: - SetPageChecked(page); SetPageError(page); + return false; } static struct page * ext2_get_page(struct inode *dir, unsigned long n, @@ -201,10 +201,10 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n, struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { kmap(page); - if (!PageChecked(page)) - ext2_check_page(page, quiet); - if (PageError(page)) - goto fail; + if (unlikely(!PageChecked(page))) { + if (PageError(page) || !ext2_check_page(page, quiet)) + goto fail; + } } return page; @@ -287,8 +287,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx) loff_t pos = ctx->pos; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - unsigned int offset = pos & ~PAGE_CACHE_MASK; - unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned int offset = pos & ~PAGE_MASK; + unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); unsigned char *types = NULL; @@ -309,14 +309,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx) ext2_error(sb, __func__, "bad page in #%lu", inode->i_ino); - ctx->pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_SIZE - offset; return PTR_ERR(page); } kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ext2_validate_entry(kaddr, offset, chunk_mask); - ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_SHIFT) + offset; } file->f_version = inode->i_version; need_revalidate = 0; @@ -406,7 +406,7 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, if (++n >= npages) n = 0; /* next page is past the blocks we've got */ - if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { ext2_error(dir->i_sb, __func__, "dir %lu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, @@ -511,7 +511,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) kaddr = page_address(page); dir_end = kaddr + ext2_last_byte(dir, n); de = (ext2_dirent *)kaddr; - kaddr += PAGE_CACHE_SIZE - reclen; + kaddr += PAGE_SIZE - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ @@ -655,7 +655,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) kunmap_atomic(kaddr); err = ext2_commit_chunk(page, 0, chunk_size); fail: - page_cache_release(page); + put_page(page); return err; } @@ -716,7 +716,7 @@ not_empty: const struct file_operations ext2_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = ext2_readdir, + .iterate_shared = ext2_readdir, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c1400b109805..868c02317b05 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = __dax_fault(vma, vmf, ext2_get_block, NULL); + ret = __dax_fault(vma, vmf, ext2_get_block); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, } down_read(&ei->dax_sem); - ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); up_read(&ei->dax_sem); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6bd58e6ff038..fcbe58641e40 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -26,6 +26,7 @@ #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/dax.h> +#include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/writeback.h> #include <linux/buffer_head.h> @@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode, * so that it's not found by another thread before it's * initialised */ - err = dax_clear_sectors(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) << - (inode->i_blkbits - 9), - 1 << inode->i_blkbits); + err = sb_issue_zeroout(inode->i_sb, + le32_to_cpu(chain[depth-1].key), count, + GFP_NOFS); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } + } else + set_buffer_new(bh_result); ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); - set_buffer_new(bh_result); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) @@ -854,20 +854,20 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block) } static ssize_t -ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; ssize_t ret; if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL, + ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL, DIO_LOCKING); else - ret = blockdev_direct_IO(iocb, inode, iter, offset, - ext2_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); return ret; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 7a2be8f7f3c3..d446203127fc 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -82,7 +82,7 @@ struct dentry *ext2_get_parent(struct dentry *child) unsigned long ino = ext2_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(ext2_iget(d_inode(child)->i_sb, ino)); + return d_obtain_alias(ext2_iget(child->d_sb, ino)); } /* @@ -398,7 +398,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); else { kunmap(dir_page); - page_cache_release(dir_page); + put_page(dir_page); } inode_dec_link_count(old_dir); } @@ -408,11 +408,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, out_dir: if (dir_de) { kunmap(dir_page); - page_cache_release(dir_page); + put_page(dir_page); } out_old: kunmap(old_page); - page_cache_release(old_page); + put_page(old_page); out: return err; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b78caf25f746..1d9379568aa8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext2_msg(sb, KERN_ERR, - "error: device does not support dax"); - goto failed_mount; - } } /* If the blocksize doesn't match, re-read the thing.. */ diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index ba97f243b050..7fd3b867ce65 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -9,10 +9,10 @@ static int ext2_xattr_security_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_get(inode, EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 2c94d1930626..0f85705ff519 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -16,10 +16,10 @@ ext2_xattr_trusted_list(struct dentry *dentry) static int ext2_xattr_trusted_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 72a2a96d677f..1fafd27037cc 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -18,12 +18,12 @@ ext2_xattr_user_list(struct dentry *dentry) static int ext2_xattr_user_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER, + return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size); } diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 69b1e73026a5..c6601a476c02 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -172,9 +172,6 @@ ext4_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); kfree(value); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - return acl; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fe1f50fe764f..3020fd70c392 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -610,7 +610,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + return 1; } /* diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index edc053a81914..6a6c27373b54 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -32,6 +32,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <linux/spinlock_types.h> +#include <linux/namei.h> #include "ext4_extents.h" #include "xattr.h" @@ -91,7 +92,8 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode, + gfp_t gfp_flags) { struct ext4_crypto_ctx *ctx = NULL; int res = 0; @@ -118,7 +120,7 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) list_del(&ctx->free_list); spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); if (!ctx) { - ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, gfp_flags); if (!ctx) { res = -ENOMEM; goto out; @@ -255,7 +257,8 @@ static int ext4_page_crypto(struct inode *inode, ext4_direction_t rw, pgoff_t index, struct page *src_page, - struct page *dest_page) + struct page *dest_page, + gfp_t gfp_flags) { u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; @@ -266,7 +269,7 @@ static int ext4_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - req = skcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", @@ -283,10 +286,10 @@ static int ext4_page_crypto(struct inode *inode, EXT4_XTS_TWEAK_SIZE - sizeof(index)); sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); + sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + sg_set_page(&src, src_page, PAGE_SIZE, 0); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, xts_tweak); if (rw == EXT4_DECRYPT) res = crypto_skcipher_decrypt(req); @@ -307,9 +310,10 @@ static int ext4_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) +static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx, + gfp_t gfp_flags) { - ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT); + ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); ctx->flags |= EXT4_WRITE_PATH_FL; @@ -332,7 +336,8 @@ static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) * error value or NULL. */ struct page *ext4_encrypt(struct inode *inode, - struct page *plaintext_page) + struct page *plaintext_page, + gfp_t gfp_flags) { struct ext4_crypto_ctx *ctx; struct page *ciphertext_page = NULL; @@ -340,17 +345,17 @@ struct page *ext4_encrypt(struct inode *inode, BUG_ON(!PageLocked(plaintext_page)); - ctx = ext4_get_crypto_ctx(inode); + ctx = ext4_get_crypto_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *) ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; ctx->w.control_page = plaintext_page; err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); + plaintext_page, ciphertext_page, gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); errout: @@ -378,8 +383,8 @@ int ext4_decrypt(struct page *page) { BUG_ON(!PageLocked(page)); - return ext4_page_crypto(page->mapping->host, - EXT4_DECRYPT, page->index, page, page); + return ext4_page_crypto(page->mapping->host, EXT4_DECRYPT, + page->index, page, page, GFP_NOFS); } int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, @@ -396,13 +401,13 @@ int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, (unsigned long) inode->i_ino, lblk, len); #endif - BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - ctx = ext4_get_crypto_ctx(inode); + ctx = ext4_get_crypto_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); if (IS_ERR(ciphertext_page)) { err = PTR_ERR(ciphertext_page); goto errout; @@ -410,11 +415,12 @@ int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, while (len--) { err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page); + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); if (err) goto errout; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(GFP_NOWAIT, 1); if (!bio) { err = -ENOMEM; goto errout; @@ -473,13 +479,19 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) */ static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) { - struct inode *dir = d_inode(dentry->d_parent); - struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info; + struct dentry *dir; + struct ext4_crypt_info *ci; int dir_has_key, cached_with_key; - if (!ext4_encrypted_inode(dir)) - return 0; + if (flags & LOOKUP_RCU) + return -ECHILD; + dir = dget_parent(dentry); + if (!ext4_encrypted_inode(d_inode(dir))) { + dput(dir); + return 0; + } + ci = EXT4_I(d_inode(dir))->i_crypt_info; if (ci && ci->ci_keyring_key && (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED) | @@ -489,6 +501,7 @@ static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) /* this should eventually be an flag in d_flags */ cached_with_key = dentry->d_fsdata != NULL; dir_has_key = (ci != NULL); + dput(dir); /* * If the dentry was cached without the key, and it is a diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 50ba27cbed03..68323e3da3fa 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -150,18 +150,23 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto errout; + } + cond_resched(); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err > 0) { pgoff_t index = map.m_pblk >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); + (PAGE_SHIFT - inode->i_blkbits); if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, &file->f_ra, file, index, 1); - file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { err = PTR_ERR(bh); @@ -266,7 +271,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } - if ((ctx->pos < inode->i_size) && !dir_relax(inode)) + if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode)) goto done; brelse(bh); bh = NULL; @@ -644,7 +649,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, const struct file_operations ext4_dir_operations = { .llseek = ext4_dir_llseek, .read = generic_read_dir, - .iterate = ext4_readdir, + .iterate_shared = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c04743519865..b84aa1ca480a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -33,6 +33,7 @@ #include <linux/ratelimit.h> #include <crypto/hash.h> #include <linux/falloc.h> +#include <linux/percpu-rwsem.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -581,6 +582,9 @@ enum { #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. */ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 /* * The bit position of these flags must not overlap with any of the @@ -912,6 +916,29 @@ do { \ #include "extents_status.h" /* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) + * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, +}; + + +/* * fourth extended file system inode data in memory */ struct ext4_inode_info { @@ -1482,6 +1509,9 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + + /* Barrier between changing inodes' journal flags and writepages ops. */ + struct percpu_rw_semaphore s_journal_flag_rwsem; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1526,7 +1556,6 @@ enum { EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ - EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ }; @@ -1961,7 +1990,7 @@ ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); -#if (PAGE_CACHE_SIZE >= 65536) +#if (PAGE_SIZE >= 65536) if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); @@ -1974,7 +2003,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) BUG(); -#if (PAGE_CACHE_SIZE >= 65536) +#if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { @@ -2282,11 +2311,13 @@ extern struct kmem_cache *ext4_crypt_info_cachep; bool ext4_valid_contents_enc_mode(uint32_t mode); uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); extern struct workqueue_struct *ext4_read_workqueue; -struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode); +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode, + gfp_t gfp_flags); void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, - struct page *plaintext_page); + struct page *plaintext_page, + gfp_t gfp_flags); int ext4_decrypt(struct page *page); int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); @@ -2496,8 +2527,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, @@ -2556,8 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); @@ -3305,6 +3334,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } +static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) +{ + int blksize = 1 << inode->i_blkbits; + + return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5f5846211095..09c1ef38cbe6 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -359,10 +359,21 @@ static inline int ext4_journal_force_commit(journal_t *journal) return 0; } -static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +static inline int ext4_jbd2_inode_add_write(handle_t *handle, + struct inode *inode) { if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); + return jbd2_journal_inode_add_write(handle, + EXT4_I(inode)->jinode); + return 0; +} + +static inline int ext4_jbd2_inode_add_wait(handle_t *handle, + struct inode *inode) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_inode_add_wait(handle, + EXT4_I(inode)->jinode); return 0; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 95bf4679ac54..2a2eef9c14e4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -120,9 +120,14 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle, if (!ext4_handle_valid(handle)) return 0; - if (handle->h_buffer_credits > needed) + if (handle->h_buffer_credits >= needed) return 0; - err = ext4_journal_extend(handle, needed); + /* + * If we need to extend the journal get a few extra blocks + * while we're at it for efficiency's sake. + */ + needed += 3; + err = ext4_journal_extend(handle, needed - handle->h_buffer_credits); if (err <= 0) return err; err = ext4_truncate_restart_trans(handle, inode, needed); @@ -907,13 +912,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_block_hdr(bh); ppos++; - if (unlikely(ppos > depth)) { - put_bh(bh); - EXT4_ERROR_INODE(inode, - "ppos %d > depth %d", ppos, depth); - ret = -EFSCORRUPTED; - goto err; - } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; } @@ -2583,7 +2581,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, } } else ext4_error(sbi->s_sb, "strange request: removal(2) " - "%u-%u from %u:%u\n", + "%u-%u from %u:%u", from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } @@ -3738,7 +3736,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef EXT4_DEBUG ext4_warning("Inode (%ld) finished: extent logical block %llu," - " len %u; IO logical block %llu, len %u\n", + " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e38b987ac7f5..37e059202cd2 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -707,7 +707,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, (status & EXTENT_STATUS_WRITTEN)) { ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " " delayed and written which can potentially " - " cause data loss.\n", lblk, len); + " cause data loss.", lblk, len); WARN_ON(1); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6659e216385e..df44c877892a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -169,13 +169,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); - if (ret > 0) { - ssize_t err; - - err = generic_write_sync(file, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; - } + if (ret > 0) + ret = generic_write_sync(iocb, ret); if (o_direct) blk_finish_plug(&plug); @@ -207,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); + result = __dax_fault(vma, vmf, ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) @@ -243,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_mmap_get_block, NULL); + ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) @@ -329,7 +324,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct vfsmount *mnt = filp->f_path.mnt; - struct inode *dir = filp->f_path.dentry->d_parent->d_inode; + struct dentry *dir; struct path path; char buf[64], *cp; int ret; @@ -373,14 +368,18 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ext4_encryption_info(inode) == NULL) return -ENOKEY; } - if (ext4_encrypted_inode(dir) && - !ext4_is_child_context_consistent_with_parent(dir, inode)) { + + dir = dget_parent(file_dentry(filp)); + if (ext4_encrypted_inode(d_inode(dir)) && + !ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) { ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu\n", - (unsigned long) dir->i_ino, + "Inconsistent encryption contexts: %lu/%lu", + (unsigned long) d_inode(dir)->i_ino, (unsigned long) inode->i_ino); + dput(dir); return -EPERM; } + dput(dir); /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -428,8 +427,8 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, lastoff = startoff; endoff = (loff_t)end_blk << blkbits; - index = startoff >> PAGE_CACHE_SHIFT; - end = endoff >> PAGE_CACHE_SHIFT; + index = startoff >> PAGE_SHIFT; + end = endoff >> PAGE_SHIFT; pagevec_init(&pvec, 0); do { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 237b877d316d..3da4cf8d18b6 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1150,25 +1150,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); ext4_group_t block_group; int bit; - struct buffer_head *bitmap_bh; + struct buffer_head *bitmap_bh = NULL; struct inode *inode = NULL; - long err = -EIO; + int err = -EFSCORRUPTED; - /* Error cases - e2fsck has already cleaned up for us */ - if (ino > max_ino) { - ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); - err = -EFSCORRUPTED; - goto error; - } + if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) + goto bad_orphan; block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - ext4_warning(sb, "inode bitmap error %ld for orphan %lu", - ino, err); - goto error; + ext4_error(sb, "inode bitmap error %ld for orphan %lu", + ino, PTR_ERR(bitmap_bh)); + return (struct inode *) bitmap_bh; } /* Having the inode bit set should be a 100% indicator that this @@ -1179,15 +1174,21 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) goto bad_orphan; inode = ext4_iget(sb, ino); - if (IS_ERR(inode)) - goto iget_failed; + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error(sb, "couldn't read orphan inode %lu (err %d)", + ino, err); + return inode; + } /* - * If the orphans has i_nlinks > 0 then it should be able to be - * truncated, otherwise it won't be removed from the orphan list - * during processing and an infinite loop will result. + * If the orphans has i_nlinks > 0 then it should be able to + * be truncated, otherwise it won't be removed from the orphan + * list during processing and an infinite loop will result. + * Similarly, it must not be a bad inode. */ - if (inode->i_nlink && !ext4_can_truncate(inode)) + if ((inode->i_nlink && !ext4_can_truncate(inode)) || + is_bad_inode(inode)) goto bad_orphan; if (NEXT_ORPHAN(inode) > max_ino) @@ -1195,29 +1196,25 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) brelse(bitmap_bh); return inode; -iget_failed: - err = PTR_ERR(inode); - inode = NULL; bad_orphan: - ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", - bit, (unsigned long long)bitmap_bh->b_blocknr, - ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_WARNING "inode=%p\n", inode); + ext4_error(sb, "bad orphan inode %lu", ino); + if (bitmap_bh) + printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); if (inode) { - printk(KERN_WARNING "is_bad_inode(inode)=%d\n", + printk(KERN_ERR "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_WARNING "max_ino=%lu\n", max_ino); - printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); + printk(KERN_ERR "max_ino=%lu\n", max_ino); + printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; iput(inode); } brelse(bitmap_bh); -error: return ERR_PTR(err); } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3027fa681de5..bc15c2c17633 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -649,133 +649,6 @@ out: } /* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_iter_count(iter); - int retries = 0; - - if (iov_iter_rw(iter) == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); - } - } - -retry: - if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) { - /* - * Nolock dioread optimization may be dynamically disabled - * via ext4_inode_block_unlocked_dio(). Check inode's state - * while holding extra i_dio_count ref. - */ - inode_dio_begin(inode); - smp_mb(); - if (unlikely(ext4_test_inode_state(inode, - EXT4_STATE_DIOREAD_LOCK))) { - inode_dio_end(inode); - goto locked; - } - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, - ext4_dio_get_block, NULL, 0); - else - ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, - offset, ext4_dio_get_block, - NULL, NULL, 0); - inode_dio_end(inode); - } else { -locked: - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, - ext4_dio_get_block, NULL, DIO_LOCKING); - else - ret = blockdev_direct_IO(iocb, inode, iter, offset, - ext4_dio_get_block); - - if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - ext4_truncate_failed_write(inode); - } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - -/* * Calculate the number of metadata blocks need to reserve * to allocate a new block at @lblocks for non extent file based file */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7cbdd3752ba5..ff7538c26992 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -482,7 +482,7 @@ static int ext4_read_inline_page(struct inode *inode, struct page *page) ret = ext4_read_inline_data(inode, kaddr, len, &iloc); flush_dcache_page(page); kunmap_atomic(kaddr); - zero_user_segment(page, len, PAGE_CACHE_SIZE); + zero_user_segment(page, len, PAGE_SIZE); SetPageUptodate(page); brelse(iloc.bh); @@ -507,7 +507,7 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) if (!page->index) ret = ext4_read_inline_page(inode, page); else if (!PageUptodate(page)) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } @@ -595,7 +595,7 @@ retry: if (ret) { unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; ext4_orphan_add(handle, inode); up_write(&EXT4_I(inode)->xattr_sem); @@ -621,7 +621,7 @@ retry: out: if (page) { unlock_page(page); - page_cache_release(page); + put_page(page); } if (sem_held) up_write(&EXT4_I(inode)->xattr_sem); @@ -690,7 +690,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, if (!ext4_has_inline_data(inode)) { ret = 0; unlock_page(page); - page_cache_release(page); + put_page(page); goto out_up_read; } @@ -815,7 +815,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, if (ret) { up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); - page_cache_release(page); + put_page(page); ext4_truncate_failed_write(inode); return ret; } @@ -829,7 +829,7 @@ out: up_read(&EXT4_I(inode)->xattr_sem); if (page) { unlock_page(page); - page_cache_release(page); + put_page(page); } return ret; } @@ -919,7 +919,7 @@ retry_journal: out_release_page: up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); - page_cache_release(page); + put_page(page); out_journal: ext4_journal_stop(handle); out: @@ -947,7 +947,7 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, i_size_changed = 1; } unlock_page(page); - page_cache_release(page); + put_page(page); /* * Don't mark the inode dirty under page lock. First, it unnecessarily @@ -1780,7 +1780,7 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data) ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - " "inode %u, rec_len %u, name_len %d" - "inline size %d\n", + "inline size %d", dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dab84a2530ff..f7140ca66e3b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -684,6 +684,24 @@ out_sem: ret = check_block_validity(inode, map); if (ret != 0) return ret; + + /* + * Inodes with freshly allocated blocks where contents will be + * visible after transaction commit must be on transaction's + * ordered data list. + */ + if (map->m_flags & EXT4_MAP_NEW && + !(map->m_flags & EXT4_MAP_UNWRITTEN) && + !(flags & EXT4_GET_BLOCKS_ZERO) && + !IS_NOQUOTA(inode) && + ext4_should_order_data(inode)) { + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + ret = ext4_jbd2_inode_add_wait(handle, inode); + else + ret = ext4_jbd2_inode_add_write(handle, inode); + if (ret) + return ret; + } } return retval; } @@ -763,39 +781,47 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 -static handle_t *start_dio_trans(struct inode *inode, - struct buffer_head *bh_result) +/* + * Get blocks function for the cases that need to start a transaction - + * generally difference cases of direct IO and DAX IO. It also handles retries + * in case of ENOSPC. + */ +static int ext4_get_block_trans(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int flags) { int dio_credits; + handle_t *handle; + int retries = 0; + int ret; /* Trim mapping request to maximum we can map at once for DIO */ if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; dio_credits = ext4_chunk_trans_blocks(inode, bh_result->b_size >> inode->i_blkbits); - return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); +retry: + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ret = _ext4_get_block(inode, iblock, bh_result, flags); + ext4_journal_stop(handle); + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + return ret; } /* Get block function for DIO reads and writes to inodes without extents */ int ext4_dio_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - handle_t *handle; - int ret; - /* We don't expect handle for direct IO */ WARN_ON_ONCE(ext4_journal_current_handle()); - if (create) { - handle = start_dio_trans(inode, bh); - if (IS_ERR(handle)) - return PTR_ERR(handle); - } - ret = _ext4_get_block(inode, iblock, bh, - create ? EXT4_GET_BLOCKS_CREATE : 0); - if (create) - ext4_journal_stop(handle); - return ret; + if (!create) + return _ext4_get_block(inode, iblock, bh, 0); + return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE); } /* @@ -806,18 +832,13 @@ int ext4_dio_get_block(struct inode *inode, sector_t iblock, static int ext4_dio_get_block_unwritten_async(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle; int ret; /* We don't expect handle for direct IO */ WARN_ON_ONCE(ext4_journal_current_handle()); - handle = start_dio_trans(inode, bh_result); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - ext4_journal_stop(handle); + ret = ext4_get_block_trans(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); /* * When doing DIO using unwritten extents, we need io_end to convert @@ -850,18 +871,13 @@ static int ext4_dio_get_block_unwritten_async(struct inode *inode, static int ext4_dio_get_block_unwritten_sync(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle; int ret; /* We don't expect handle for direct IO */ WARN_ON_ONCE(ext4_journal_current_handle()); - handle = start_dio_trans(inode, bh_result); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - ext4_journal_stop(handle); + ret = ext4_get_block_trans(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); /* * Mark inode as having pending DIO writes to unwritten extents. @@ -1057,7 +1073,7 @@ int do_journal_get_write_access(handle_t *handle, static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; struct inode *inode = page->mapping->host; unsigned block_start, block_end; @@ -1069,15 +1085,15 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, bool decrypt = false; BUG_ON(!PageLocked(page)); - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > PAGE_SIZE); + BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); head = page_buffers(page); bbits = ilog2(blocksize); - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + block = (sector_t)page->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { @@ -1159,8 +1175,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * we allocate blocks but write fails for some reason */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; - index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); + index = pos >> PAGE_SHIFT; + from = pos & (PAGE_SIZE - 1); to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -1188,7 +1204,7 @@ retry_grab: retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { - page_cache_release(page); + put_page(page); return PTR_ERR(handle); } @@ -1196,7 +1212,7 @@ retry_journal: if (page->mapping != mapping) { /* The page got truncated from under us */ unlock_page(page); - page_cache_release(page); + put_page(page); ext4_journal_stop(handle); goto retry_grab; } @@ -1252,7 +1268,7 @@ retry_journal: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; - page_cache_release(page); + put_page(page); return ret; } *pagep = page; @@ -1291,15 +1307,6 @@ static int ext4_write_end(struct file *file, int i_size_changed = 0; trace_ext4_write_end(inode, pos, len, copied); - if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { - ret = ext4_jbd2_file_inode(handle, inode); - if (ret) { - unlock_page(page); - page_cache_release(page); - goto errout; - } - } - if (ext4_has_inline_data(inode)) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); @@ -1315,7 +1322,7 @@ static int ext4_write_end(struct file *file, */ i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); - page_cache_release(page); + put_page(page); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); @@ -1399,7 +1406,7 @@ static int ext4_journalled_write_end(struct file *file, int size_changed = 0; trace_ext4_journalled_write_end(inode, pos, len, copied); - from = pos & (PAGE_CACHE_SIZE - 1); + from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); @@ -1423,7 +1430,7 @@ static int ext4_journalled_write_end(struct file *file, ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; unlock_page(page); - page_cache_release(page); + put_page(page); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); @@ -1537,7 +1544,7 @@ static void ext4_da_page_release_reservation(struct page *page, int num_clusters; ext4_fsblk_t lblk; - BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + BUG_ON(stop > PAGE_SIZE || stop < length); head = page_buffers(page); bh = head; @@ -1553,7 +1560,7 @@ static void ext4_da_page_release_reservation(struct page *page, clear_buffer_delay(bh); } else if (contiguous_blks) { lblk = page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); + (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; ext4_es_remove_extent(inode, lblk, contiguous_blks); @@ -1563,7 +1570,7 @@ static void ext4_da_page_release_reservation(struct page *page, } while ((bh = bh->b_this_page) != head); if (contiguous_blks) { - lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; ext4_es_remove_extent(inode, lblk, contiguous_blks); } @@ -1572,7 +1579,7 @@ static void ext4_da_page_release_reservation(struct page *page, * need to release the reserved space for that cluster. */ num_clusters = EXT4_NUM_B2C(sbi, to_release); while (num_clusters > 0) { - lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + + lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || !ext4_find_delalloc_cluster(inode, lblk)) @@ -1619,8 +1626,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; - start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + start = index << (PAGE_SHIFT - inode->i_blkbits); + last = end << (PAGE_SHIFT - inode->i_blkbits); ext4_es_remove_extent(inode, start, last - start + 1); } @@ -1636,7 +1643,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { - block_invalidatepage(page, 0, PAGE_CACHE_SIZE); + block_invalidatepage(page, 0, PAGE_SIZE); ClearPageUptodate(page); } unlock_page(page); @@ -2007,10 +2014,10 @@ static int ext4_writepage(struct page *page, trace_ext4_writepage(page); size = i_size_read(inode); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; + if (page->index == size >> PAGE_SHIFT) + len = size & ~PAGE_MASK; else - len = PAGE_CACHE_SIZE; + len = PAGE_SIZE; page_bufs = page_buffers(page); /* @@ -2034,7 +2041,7 @@ static int ext4_writepage(struct page *page, ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); if ((current->flags & PF_MEMALLOC) || - (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) { + (inode->i_sb->s_blocksize == PAGE_SIZE)) { /* * For memory cleaning there's no point in writing only * some buffers. So just bail out. Warn if we came here @@ -2076,10 +2083,10 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) int err; BUG_ON(page->index != mpd->first_page); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; + if (page->index == size >> PAGE_SHIFT) + len = size & ~PAGE_MASK; else - len = PAGE_CACHE_SIZE; + len = PAGE_SIZE; clear_page_dirty_for_io(page); err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) @@ -2213,7 +2220,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) int nr_pages, i; struct inode *inode = mpd->inode; struct buffer_head *head, *bh; - int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; + int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; sector_t pblock; @@ -2274,7 +2281,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) * supports blocksize < pagesize as we will try to * convert potentially unmapped parts of inode. */ - mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; + mpd->io_submit.io_end->size += PAGE_SIZE; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); if (err < 0) { @@ -2315,7 +2322,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | - EXT4_GET_BLOCKS_METADATA_NOFAIL; + EXT4_GET_BLOCKS_METADATA_NOFAIL | + EXT4_GET_BLOCKS_IO_SUBMIT; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; @@ -2426,7 +2434,7 @@ update_disksize: * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ - disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; + disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; loff_t i_size; @@ -2562,7 +2570,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->next_page = page->index + 1; /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)page->index) << - (PAGE_CACHE_SHIFT - blkbits); + (PAGE_SHIFT - blkbits); head = page_buffers(page); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) @@ -2604,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping, struct blk_plug plug; bool give_up_on_write = false; + percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - if (dax_mapping(mapping)) - return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, - wbc); + if (dax_mapping(mapping)) { + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + goto out_writepages; + } /* * No pages to write? This is mainly a kludge to avoid starting @@ -2647,7 +2658,7 @@ static int ext4_writepages(struct address_space *mapping, * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ - rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); + rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits); } /* @@ -2678,8 +2689,8 @@ static int ext4_writepages(struct address_space *mapping, mpd.first_page = writeback_index; mpd.last_page = -1; } else { - mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; - mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; + mpd.first_page = wbc->range_start >> PAGE_SHIFT; + mpd.last_page = wbc->range_end >> PAGE_SHIFT; } mpd.inode = inode; @@ -2778,6 +2789,7 @@ retry: out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); + percpu_up_read(&sbi->s_journal_flag_rwsem); return ret; } @@ -2838,7 +2850,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; handle_t *handle; - index = pos >> PAGE_CACHE_SHIFT; + index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; @@ -2881,7 +2893,7 @@ retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_da_write_credits(inode, pos, len)); if (IS_ERR(handle)) { - page_cache_release(page); + put_page(page); return PTR_ERR(handle); } @@ -2889,7 +2901,7 @@ retry_journal: if (page->mapping != mapping) { /* The page got truncated from under us */ unlock_page(page); - page_cache_release(page); + put_page(page); ext4_journal_stop(handle); goto retry_grab; } @@ -2917,7 +2929,7 @@ retry_journal: ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; - page_cache_release(page); + put_page(page); return ret; } @@ -2965,7 +2977,7 @@ static int ext4_da_write_end(struct file *file, len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); - start = pos & (PAGE_CACHE_SIZE - 1); + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; /* @@ -3187,7 +3199,7 @@ static int __ext4_journalled_invalidatepage(struct page *page, /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0 && length == PAGE_CACHE_SIZE) + if (offset == 0 && length == PAGE_SIZE) ClearPageChecked(page); return jbd2_journal_invalidatepage(journal, page, offset, length); @@ -3217,75 +3229,52 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX -int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +/* + * Get block function for DAX IO and mmap faults. It takes care of converting + * unwritten extents to written ones and initializes new / converted blocks + * to zeros. + */ +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { - int ret, err; - int credits; - struct ext4_map_blocks map; - handle_t *handle = NULL; - int flags = 0; + int ret; - ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", - inode->i_ino, create); - map.m_lblk = iblock; - map.m_len = bh_result->b_size >> inode->i_blkbits; - credits = ext4_chunk_trans_blocks(inode, map.m_len); - if (create) { - flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; - } - } + ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); + if (!create) + return _ext4_get_block(inode, iblock, bh_result, 0); - ret = ext4_map_blocks(handle, inode, &map, flags); - if (create) { - err = ext4_journal_stop(handle); - if (ret >= 0 && err < 0) - ret = err; - } - if (ret <= 0) - goto out; - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int err2; + ret = ext4_get_block_trans(inode, iblock, bh_result, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) + return ret; + if (buffer_unwritten(bh_result)) { /* - * We are protected by i_mmap_sem so we know block cannot go - * away from under us even though we dropped i_data_sem. - * Convert extent to written and write zeros there. - * - * Note: We may get here even when create == 0. + * We are protected by i_mmap_sem or i_mutex so we know block + * cannot go away from under us even though we dropped + * i_data_sem. Convert extent to written and write zeros there. */ - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - err = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); - if (err < 0) - ret = err; - err2 = ext4_journal_stop(handle); - if (err2 < 0 && ret > 0) - ret = err2; - } -out: - WARN_ON_ONCE(ret == 0 && create); - if (ret > 0) { - map_bh(bh_result, inode->i_sb, map.m_pblk); - /* - * At least for now we have to clear BH_New so that DAX code - * doesn't attempt to zero blocks again in a racy way. - */ - map.m_flags &= ~EXT4_MAP_NEW; - ext4_update_bh_state(bh_result, map.m_flags); - bh_result->b_size = map.m_len << inode->i_blkbits; - ret = 0; + ret = ext4_get_block_trans(inode, iblock, bh_result, + EXT4_GET_BLOCKS_CONVERT | + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) + return ret; } - return ret; + /* + * At least for now we have to clear BH_New so that DAX code + * doesn't attempt to zero blocks again in a racy way. + */ + clear_buffer_new(bh_result); + return 0; +} +#else +/* Just define empty function, it will never get called. */ +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + BUG(); + return 0; } #endif @@ -3318,7 +3307,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, } /* - * For ext4 extent files, ext4 will do direct-io write to holes, + * Handling of direct IO writes. + * + * For ext4 extent files, ext4 will do direct-io write even to holes, * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * @@ -3336,21 +3327,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * if the machine crashes during the write. * */ -static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); ssize_t ret; + loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(iter); int overwrite = 0; get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; + int orphan = 0; + handle_t *handle; - /* Use the old path for reads and writes beyond i_size. */ - if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(iocb, iter, offset); + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } BUG_ON(iocb->private == NULL); @@ -3359,8 +3366,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * conversion. This also disallows race between truncate() and * overwrite DIO as i_dio_count needs to be incremented under i_mutex. */ - if (iov_iter_rw(iter) == WRITE) - inode_dio_begin(inode); + inode_dio_begin(inode); /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); @@ -3369,7 +3375,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, inode_unlock(inode); /* - * We could direct write to holes and fallocate. + * For extent mapped files we could direct write to holes and fallocate. * * Allocated blocks to fill the hole are marked as unwritten to prevent * parallel buffered read to expose the stale data before DIO complete @@ -3391,7 +3397,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (is_sync_kiocb(iocb)) { + else if (IS_DAX(inode)) { + /* + * We can avoid zeroing for aligned DAX writes beyond EOF. Other + * writes need zeroing either because they can race with page + * faults or because they use partial blocks. + */ + if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && + ext4_aligned_io(inode, offset, count)) + get_block_func = ext4_dio_get_block; + else + get_block_func = ext4_dax_get_block; + dio_flags = DIO_LOCKING; + } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { + get_block_func = ext4_dio_get_block; + dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; + } else if (is_sync_kiocb(iocb)) { get_block_func = ext4_dio_get_block_unwritten_sync; dio_flags = DIO_LOCKING; } else { @@ -3401,12 +3423,12 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, #ifdef CONFIG_EXT4_FS_ENCRYPTION BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, get_block_func, + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, iter, get_block_func, ext4_end_io_dio, dio_flags); - else + } else ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, offset, + inode->i_sb->s_bdev, iter, get_block_func, ext4_end_io_dio, NULL, dio_flags); @@ -3424,21 +3446,95 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } - if (iov_iter_rw(iter) == WRITE) - inode_dio_end(inode); + inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) inode_lock(inode); + if (ret < 0 && final_size > inode->i_size) + ext4_truncate_failed_write(inode); + + /* Handle extending of i_size after direct IO write */ + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) +{ + int unlocked = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; + ssize_t ret; + + if (ext4_should_dioread_nolock(inode)) { + /* + * Nolock dioread optimization may be dynamically disabled + * via ext4_inode_block_unlocked_dio(). Check inode's state + * while holding extra i_dio_count ref. + */ + inode_dio_begin(inode); + smp_mb(); + if (unlikely(ext4_test_inode_state(inode, + EXT4_STATE_DIOREAD_LOCK))) + inode_dio_end(inode); + else + unlocked = 1; + } + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, + NULL, unlocked ? 0 : DIO_LOCKING); + } else { + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, ext4_dio_get_block, + NULL, NULL, + unlocked ? 0 : DIO_LOCKING); + } + if (unlocked) + inode_dio_end(inode); return ret; } -static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; ssize_t ret; #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -3457,10 +3553,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(iocb, iter, offset); + if (iov_iter_rw(iter) == READ) + ret = ext4_direct_IO_read(iocb, iter); else - ret = ext4_ind_direct_IO(iocb, iter, offset); + ret = ext4_direct_IO_write(iocb, iter); trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); return ret; } @@ -3536,10 +3632,7 @@ void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: - ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); - break; case EXT4_INODE_WRITEBACK_DATA_MODE: - ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; @@ -3556,8 +3649,8 @@ void ext4_set_aops(struct inode *inode) static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { - ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + ext4_fsblk_t index = from >> PAGE_SHIFT; + unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; @@ -3565,14 +3658,14 @@ static int __ext4_block_zero_page_range(handle_t *handle, struct page *page; int err = 0; - page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + page = find_or_create_page(mapping, from >> PAGE_SHIFT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) return -ENOMEM; blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -3614,7 +3707,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, ext4_encrypted_inode(inode)) { /* We expect the key to be set. */ BUG_ON(!ext4_has_encryption_key(inode)); - BUG_ON(blocksize != PAGE_CACHE_SIZE); + BUG_ON(blocksize != PAGE_SIZE); WARN_ON_ONCE(ext4_decrypt(page)); } } @@ -3632,13 +3725,13 @@ static int __ext4_block_zero_page_range(handle_t *handle, } else { err = 0; mark_buffer_dirty(bh); - if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) - err = ext4_jbd2_file_inode(handle, inode); + if (ext4_should_order_data(inode)) + err = ext4_jbd2_inode_add_write(handle, inode); } unlock: unlock_page(page); - page_cache_release(page); + put_page(page); return err; } @@ -3653,7 +3746,7 @@ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; unsigned max = blocksize - (offset & (blocksize - 1)); @@ -3678,7 +3771,7 @@ static int ext4_block_zero_page_range(handle_t *handle, static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; @@ -3816,7 +3909,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) */ if (offset + length > inode->i_size) { length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - offset; } @@ -4891,23 +4984,23 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) tid_t commit_tid = 0; int ret; - offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + offset = inode->i_size & (PAGE_SIZE - 1); /* * All buffers in the last page remain valid? Then there's nothing to - * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE == + * do. We do the check mainly to optimize the common PAGE_SIZE == * blocksize case */ - if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) + if (offset > PAGE_SIZE - (1 << inode->i_blkbits)) return; while (1) { page = find_lock_page(inode->i_mapping, - inode->i_size >> PAGE_CACHE_SHIFT); + inode->i_size >> PAGE_SHIFT); if (!page) return; ret = __ext4_journalled_invalidatepage(page, offset, - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); unlock_page(page); - page_cache_release(page); + put_page(page); if (ret != -EBUSY) return; commit_tid = 0; @@ -5431,6 +5524,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); /* * We have to be very careful here: changing a data block's @@ -5447,22 +5541,30 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return 0; if (is_journal_aborted(journal)) return -EROFS; - /* We have to allocate physical blocks for delalloc blocks - * before flushing journal. otherwise delalloc blocks can not - * be allocated any more. even more truncate on delalloc blocks - * could trigger BUG by flushing delalloc blocks in journal. - * There is no delalloc block in non-journal data mode. - */ - if (val && test_opt(inode->i_sb, DELALLOC)) { - err = ext4_alloc_da_blocks(inode); - if (err < 0) - return err; - } /* Wait for all existing dio workers */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Before flushing the journal and switching inode's aops, we have + * to flush all dirty data the inode has. There can be outstanding + * delayed allocations, there can be unwritten extents created by + * fallocate or buffered writes in dioread_nolock mode covered by + * dirty data which can be converted only after flushing the dirty + * data (and journalled aops don't know how to handle these cases). + */ + if (val) { + down_write(&EXT4_I(inode)->i_mmap_sem); + err = filemap_write_and_wait(inode->i_mapping); + if (err < 0) { + up_write(&EXT4_I(inode)->i_mmap_sem); + ext4_inode_resume_unlocked_dio(inode); + return err; + } + } + + percpu_down_write(&sbi->s_journal_flag_rwsem); jbd2_journal_lock_updates(journal); /* @@ -5479,6 +5581,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal); if (err < 0) { jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); ext4_inode_resume_unlocked_dio(inode); return err; } @@ -5487,6 +5590,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); + + if (val) + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); /* Finally we can mark the inode as dirty. */ @@ -5546,10 +5653,10 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; + if (page->index == size >> PAGE_SHIFT) + len = size & ~PAGE_MASK; else - len = PAGE_CACHE_SIZE; + len = PAGE_SIZE; /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time @@ -5580,7 +5687,7 @@ retry_alloc: ret = block_page_mkwrite(vma, vmf, get_block); if (!ret && ext4_should_journal_data(inode)) { if (ext4_walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { + PAGE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; ext4_journal_stop(handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eae5917c534e..28cc412852af 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -13,8 +13,8 @@ #include <linux/compat.h> #include <linux/mount.h> #include <linux/file.h> -#include <linux/random.h> #include <linux/quotaops.h> +#include <linux/uuid.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -365,7 +365,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) struct dquot *transfer_to[MAXQUOTAS] = { }; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); - if (transfer_to[PRJQUOTA]) { + if (!IS_ERR(transfer_to[PRJQUOTA])) { err = __dquot_transfer(inode, transfer_to); dqput(transfer_to[PRJQUOTA]); if (err) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 50e05df28f66..c1ab3ec30423 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -119,7 +119,7 @@ MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); * * * one block each for bitmap and buddy information. So for each group we - * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / + * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / * blocksize) blocks. So it can have information regarding groups_per_page * which is blocks_per_page/2 * @@ -807,7 +807,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A page can - * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. + * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 * @@ -839,7 +839,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = 1 << inode->i_blkbits; - blocks_per_page = PAGE_CACHE_SIZE / blocksize; + blocks_per_page = PAGE_SIZE / blocksize; groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) @@ -993,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. @@ -1028,11 +1028,11 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) { unlock_page(e4b->bd_bitmap_page); - page_cache_release(e4b->bd_bitmap_page); + put_page(e4b->bd_bitmap_page); } if (e4b->bd_buddy_page) { unlock_page(e4b->bd_buddy_page); - page_cache_release(e4b->bd_buddy_page); + put_page(e4b->bd_buddy_page); } } @@ -1125,7 +1125,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, might_sleep(); mb_debug(1, "load group %u\n", group); - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; @@ -1167,7 +1167,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, * is yet to initialize the same. So * wait for it to initialize. */ - page_cache_release(page); + put_page(page); page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); @@ -1203,7 +1203,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) - page_cache_release(page); + put_page(page); page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); @@ -1238,11 +1238,11 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, err: if (page) - page_cache_release(page); + put_page(page); if (e4b->bd_bitmap_page) - page_cache_release(e4b->bd_bitmap_page); + put_page(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) - page_cache_release(e4b->bd_buddy_page); + put_page(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; @@ -1257,15 +1257,16 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) - page_cache_release(e4b->bd_bitmap_page); + put_page(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) - page_cache_release(e4b->bd_buddy_page); + put_page(e4b->bd_buddy_page); } static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1; + int bb_incr = 1 << (e4b->bd_blkbits - 1); void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); @@ -1278,7 +1279,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) /* this block is part of buddy of order 'order' */ return order; } - bb += 1 << (e4b->bd_blkbits - order); + bb += bb_incr; + bb_incr >>= 1; order++; } return 0; @@ -2583,7 +2585,7 @@ int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; - unsigned offset; + unsigned offset, offset_incr; unsigned max; int ret; @@ -2612,11 +2614,13 @@ int ext4_mb_init(struct super_block *sb) i = 1; offset = 0; + offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; - offset += 1 << (sb->s_blocksize_bits - i); + offset += offset_incr; + offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i <= sb->s_blocksize_bits + 1); @@ -2833,8 +2837,8 @@ static void ext4_free_data_callback(struct super_block *sb, /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); + put_page(e4b.bd_buddy_page); + put_page(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->efd_group); kmem_cache_free(ext4_free_data_cachep, entry); @@ -4385,9 +4389,9 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) - page_cache_release(ac->ac_bitmap_page); + put_page(ac->ac_bitmap_page); if (ac->ac_buddy_page) - page_cache_release(ac->ac_buddy_page); + put_page(ac->ac_buddy_page); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); @@ -4599,8 +4603,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); + get_page(e4b->bd_buddy_page); + get_page(e4b->bd_bitmap_page); } while (*n) { parent = *n; @@ -4935,7 +4939,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * boundary. */ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - ext4_warning(sb, "too much blocks added to group %u\n", + ext4_warning(sb, "too much blocks added to group %u", block_group); err = -EINVAL; goto error_return; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 24445275d330..23d436d6f8b8 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -121,7 +121,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, __ext4_warning(sb, function, line, "%s", msg); __ext4_warning(sb, function, line, "MMP failure info: last update time: %llu, last update " - "node: %s, last update device: %s\n", + "node: %s, last update device: %s", (long long unsigned int) le64_to_cpu(mmp->mmp_time), mmp->mmp_nodename, mmp->mmp_bdevname); } @@ -353,7 +353,7 @@ skip: * wait for MMP interval and check mmp_seq. */ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { - ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + ext4_warning(sb, "MMP startup interrupted, failing mount"); goto failed; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 4098acc701c3..a920c5d29fac 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -60,10 +60,10 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); - down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER); } else { down_write(&EXT4_I(second)->i_data_sem); - down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); + down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); } } @@ -156,7 +156,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, page[1] = grab_cache_page_write_begin(mapping[1], index2, fl); if (!page[1]) { unlock_page(page[0]); - page_cache_release(page[0]); + put_page(page[0]); return -ENOMEM; } /* @@ -192,7 +192,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) create_empty_buffers(page, blocksize, 0); head = page_buffers(page); - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + block = (sector_t)page->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; @@ -268,7 +268,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, int i, err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; - int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; @@ -400,13 +400,13 @@ data_copy: /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ - *err = ext4_jbd2_file_inode(handle, orig_inode); + *err = ext4_jbd2_inode_add_write(handle, orig_inode); unlock_pages: unlock_page(pagep[0]); - page_cache_release(pagep[0]); + put_page(pagep[0]); unlock_page(pagep[1]); - page_cache_release(pagep[1]); + put_page(pagep[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && @@ -484,6 +484,13 @@ mext_check_arguments(struct inode *orig_inode, return -EBUSY; } + if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " + "not be quota files [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EBUSY; + } + /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " @@ -554,7 +561,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); struct ext4_ext_path *path = NULL; - int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; ext4_lblk_t o_end, o_start = orig_blk; ext4_lblk_t d_start = donor_blk; int ret; @@ -648,9 +655,9 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, if (o_end - o_start < cur_len) cur_len = o_end - o_start; - orig_page_index = o_start >> (PAGE_CACHE_SHIFT - + orig_page_index = o_start >> (PAGE_SHIFT - orig_inode->i_blkbits); - donor_page_index = d_start >> (PAGE_CACHE_SHIFT - + donor_page_index = d_start >> (PAGE_SHIFT - donor_inode->i_blkbits); offset_in_page = o_start % blocks_per_page; if (cur_len > blocks_per_page- offset_in_page) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 48e4b8907826..ec4c39952e84 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1107,6 +1107,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } while (1) { + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto errout; + } + cond_resched(); block = dx_get_block(frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); @@ -1613,7 +1618,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (nokey) return ERR_PTR(-ENOKEY); ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu\n", + "Inconsistent encryption contexts: %lu/%lu", (unsigned long) dir->i_ino, (unsigned long) inode->i_ino); return ERR_PTR(-EPERM); @@ -1638,13 +1643,13 @@ struct dentry *ext4_get_parent(struct dentry *child) ino = le32_to_cpu(de->inode); brelse(bh); - if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) { + if (!ext4_valid_inum(child->d_sb, ino)) { EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } - return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino)); + return d_obtain_alias(ext4_iget_normal(child->d_sb, ino)); } /* @@ -2828,7 +2833,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * list entries can cause panics at unmount time. */ mutex_lock(&sbi->s_orphan_lock); - list_del(&EXT4_I(inode)->i_orphan); + list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d77d15f4b674..2a01df9cc1c3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -23,6 +23,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -341,9 +342,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE; - bio_get(io->io_bio); submit_bio(io_op, io->io_bio); - bio_put(io->io_bio); } io->io_bio = NULL; } @@ -432,8 +431,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - if (len < PAGE_CACHE_SIZE) - zero_user_segment(page, len, PAGE_CACHE_SIZE); + if (len < PAGE_SIZE) + zero_user_segment(page, len, PAGE_SIZE); /* * In the first loop we prepare and mark buffers to submit. We have to * mark all buffers in the page before submitting so that @@ -470,9 +469,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && nr_to_submit) { - data_page = ext4_encrypt(inode, page); + gfp_t gfp_flags = GFP_NOFS; + + retry_encrypt: + data_page = ext4_encrypt(inode, page, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); + if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { + if (io->io_bio) { + ext4_io_submit(io); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } data_page = NULL; goto out; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5dc5e95063de..dc54a4b60eba 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -23,7 +23,7 @@ * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: - * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. + * the end-of-file on blocksize < PAGE_SIZE setups. * */ @@ -140,7 +140,7 @@ int ext4_mpage_readpages(struct address_space *mapping, struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t block_in_file; sector_t last_block; @@ -173,7 +173,7 @@ int ext4_mpage_readpages(struct address_space *mapping, if (page_has_buffers(page)) goto confused; - block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) @@ -217,7 +217,7 @@ int ext4_mpage_readpages(struct address_space *mapping, set_error_page: SetPageError(page); zero_user_segment(page, 0, - PAGE_CACHE_SIZE); + PAGE_SIZE); unlock_page(page); goto next_page; } @@ -250,7 +250,7 @@ int ext4_mpage_readpages(struct address_space *mapping, } if (first_hole != blocks_per_page) { zero_user_segment(page, first_hole << blkbits, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (first_hole == 0) { SetPageUptodate(page); unlock_page(page); @@ -279,7 +279,7 @@ int ext4_mpage_readpages(struct address_space *mapping, if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = ext4_get_crypto_ctx(inode); + ctx = ext4_get_crypto_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) goto set_error_page; } @@ -319,7 +319,7 @@ int ext4_mpage_readpages(struct address_space *mapping, unlock_page(page); next_page: if (pages) - page_cache_release(page); + put_page(page); } BUG_ON(pages && !list_empty(pages)); if (bio) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 34038e3598d5..cf681004b196 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -41,7 +41,7 @@ int ext4_resize_begin(struct super_block *sb) */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_warning(sb, "There are errors in the filesystem, " - "so online resizing is not allowed\n"); + "so online resizing is not allowed"); return -EPERM; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 539297515896..3822a5aedc61 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_free_rwsem(&sbi->s_journal_flag_rwsem); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) @@ -1113,6 +1114,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static int ext4_enable_quotas(struct super_block *sb); +static int ext4_get_next_id(struct super_block *sb, struct kqid *qid); static struct dquot **ext4_get_dquots(struct inode *inode) { @@ -1129,7 +1131,7 @@ static const struct dquot_operations ext4_quota_operations = { .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, - .get_next_id = dquot_get_next_id, + .get_next_id = ext4_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -1323,9 +1325,9 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) return -1; } if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " - "when QUOTA feature is enabled"); - return -1; + ext4_msg(sb, KERN_INFO, "Journaled quota options " + "ignored when QUOTA feature is enabled"); + return 1; } qname = match_strdup(args); if (!qname) { @@ -1688,10 +1690,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_ERR, - "Cannot set journaled quota options " + ext4_msg(sb, KERN_INFO, + "Quota format mount options ignored " "when QUOTA feature is enabled"); - return -1; + return 1; } sbi->s_jquota_fmt = m->mount_opt; #endif @@ -1756,11 +1758,11 @@ static int parse_options(char *options, struct super_block *sb, #ifdef CONFIG_QUOTA if (ext4_has_feature_quota(sb) && (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { - ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " - "feature is enabled"); - return 0; - } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota " + "mount options ignored."); + clear_opt(sb, USRQUOTA); + clear_opt(sb, GRPQUOTA); + } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -1784,7 +1786,7 @@ static int parse_options(char *options, struct super_block *sb, int blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (blocksize < PAGE_CACHE_SIZE) { + if (blocksize < PAGE_SIZE) { ext4_msg(sb, KERN_ERR, "can't mount with " "dioread_nolock if block size != PAGE_SIZE"); return 0; @@ -3415,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); - goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext4_msg(sb, KERN_ERR, - "error: device does not support dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { @@ -3808,7 +3803,7 @@ no_journal: } if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && - (blocksize != PAGE_CACHE_SIZE)) { + (blocksize != PAGE_SIZE)) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs encryption"); goto failed_mount_wq; @@ -3929,6 +3924,9 @@ no_journal: if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); + if (!err) + err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); + if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; @@ -5028,6 +5026,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) EXT4_SB(sb)->s_jquota_fmt, type); } +static void lockdep_set_quota_inode(struct inode *inode, int subclass) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + /* The first argument of lockdep_set_subclass has to be + * *exactly* the same as the argument to init_rwsem() --- in + * this case, in init_once() --- or lockdep gets unhappy + * because the name of the lock is set using the + * stringification of the argument to init_rwsem(). + */ + (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ + lockdep_set_subclass(&ei->i_data_sem, subclass); +} + /* * Standard function to be called on quota_on */ @@ -5067,8 +5079,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (err) return err; } - - return dquot_quota_on(sb, type, format_id, path); + lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); + err = dquot_quota_on(sb, type, format_id, path); + if (err) + lockdep_set_quota_inode(path->dentry->d_inode, + I_DATA_SEM_NORMAL); + return err; } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, @@ -5095,8 +5111,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; + lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_enable(qf_inode, type, format_id, flags); iput(qf_inode); + if (err) + lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); return err; } @@ -5253,6 +5272,17 @@ out: return len; } +static int ext4_get_next_id(struct super_block *sb, struct kqid *qid) +{ + const struct quota_format_ops *ops; + + if (!sb_has_quota_loaded(sb, qid->type)) + return -ESRCH; + ops = sb_dqopt(sb)->ops[qid->type]; + if (!ops || !ops->get_next_id) + return -ENOSYS; + return dquot_get_next_id(sb, qid); +} #endif static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 6f7ee30a89ce..75ed5c2f0c16 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -80,12 +80,12 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, if (res <= plen) paddr[res] = '\0'; if (cpage) - page_cache_release(cpage); + put_page(cpage); set_delayed_call(done, kfree_link, paddr); return paddr; errout: if (cpage) - page_cache_release(cpage); + put_page(cpage); kfree(paddr); return ERR_PTR(res); } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 0441e055c8e8..e79bd32b9b79 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -230,6 +230,27 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) return error; } +static int +__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line) +{ + struct ext4_xattr_entry *entry = IFIRST(header); + int error = -EFSCORRUPTED; + + if (((void *) header >= end) || + (header->h_magic != le32_to_cpu(EXT4_XATTR_MAGIC))) + goto errout; + error = ext4_xattr_check_names(entry, end, entry); +errout: + if (error) + __ext4_error_inode(inode, function, line, 0, + "corrupted in-inode xattr"); + return error; +} + +#define xattr_check_inode(inode, header, end) \ + __xattr_check_inode((inode), (header), (end), __func__, __LINE__) + static inline int ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) { @@ -341,7 +362,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(entry, end, entry); + error = xattr_check_inode(inode, header, end); if (error) goto cleanup; error = ext4_xattr_find_entry(&entry, name_index, name, @@ -477,7 +498,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); + error = xattr_check_inode(inode, header, end); if (error) goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), @@ -1040,8 +1061,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = ext4_xattr_check_names(IFIRST(header), is->s.end, - IFIRST(header)); + error = xattr_check_inode(inode, header, is->s.end); if (error) return error; /* Find the named attribute. */ @@ -1356,6 +1376,10 @@ retry: last = entry; total_ino = sizeof(struct ext4_xattr_ibody_header); + error = xattr_check_inode(inode, header, end); + if (error) + goto cleanup; + free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); if (free >= new_extra_isize) { entry = IFIRST(header); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 3e81bdca071a..123a7d010efe 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -13,10 +13,10 @@ static int ext4_xattr_security_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name, buffer, size); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 2a3c6f9b8cb8..60652fa24cbc 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -20,10 +20,10 @@ ext4_xattr_trusted_list(struct dentry *dentry) static int ext4_xattr_trusted_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, void *buffer, - size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name, buffer, size); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index d152f431e432..17a446ffecd3 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -19,12 +19,12 @@ ext4_xattr_user_list(struct dentry *dentry) static int ext4_xattr_user_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER, + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size); } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1f8982a957f1..378c221d68a9 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -94,3 +94,11 @@ config F2FS_IO_TRACE information and block IO patterns in the filesystem level. If unsure, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c8f25f7241f0..a31c7e859af6 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -115,7 +115,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) struct f2fs_acl_entry *entry; int i; - f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * + f2fs_acl = f2fs_kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * sizeof(struct f2fs_acl_entry), GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +175,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { - value = kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, @@ -190,9 +190,6 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, acl = ERR_PTR(retval); kfree(value); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - return acl; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0955312e5ca0..389160049993 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,6 +26,14 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *inode_entry_slab; +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + if (!end_io) + f2fs_flush_merged_bios(sbi); +} + /* * We guarantee no failure on the returned page. */ @@ -34,7 +42,7 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -64,7 +72,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, if (unlikely(!is_meta)) fio.rw &= ~REQ_META; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -91,7 +99,7 @@ repeat: * meta page. */ if (unlikely(!PageUptodate(page))) - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); out: return page; } @@ -186,7 +194,8 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, BUG(); } - page = grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr); + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); if (!page) continue; if (PageUptodate(page)) { @@ -211,7 +220,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) bool readahead = false; page = find_get_page(META_MAPPING(sbi), index); - if (!page || (page && !PageUptodate(page))) + if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); @@ -448,12 +457,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_ino_entry(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; - for (i = APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -473,6 +482,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) int err = 0; spin_lock(&im->ino_lock); + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + return -ENOSPC; + } +#endif if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else @@ -777,43 +793,32 @@ void update_dirty_page(struct inode *inode, struct page *page) !S_ISLNK(inode->i_mode)) return; - spin_lock(&sbi->inode_lock[type]); - __add_dirty_inode(inode, type); - inode_inc_dirty_pages(inode); - spin_unlock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) { + spin_lock(&sbi->inode_lock[type]); + __add_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); + } + inode_inc_dirty_pages(inode); SetPagePrivate(page); f2fs_trace_pid(page); } -void add_dirty_dir_inode(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - spin_lock(&sbi->inode_lock[DIR_INODE]); - __add_dirty_inode(inode, DIR_INODE); - spin_unlock(&sbi->inode_lock[DIR_INODE]); -} - void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) + return; + spin_lock(&sbi->inode_lock[type]); __remove_dirty_inode(inode, type); spin_unlock(&sbi->inode_lock[type]); - - /* Only from the recovery routine */ - if (is_inode_flag_set(fi, FI_DELAY_IPUT)) { - clear_inode_flag(fi, FI_DELAY_IPUT); - iput(inode); - } } int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) @@ -892,7 +897,7 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, 0, &wbc); + err = sync_node_pages(sbi, &wbc); if (err) { f2fs_unlock_all(sbi); goto out; @@ -917,7 +922,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WRITEBACK)) + if (!atomic_read(&sbi->nr_wb_bios)) break; io_schedule_timeout(5*HZ); @@ -1082,7 +1087,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); @@ -1098,7 +1103,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, discard_blk); - release_ino_entry(sbi); + release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) return -EIO; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e5c762b37239..9a8bbc1fb1fa 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -68,13 +68,12 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_error)) { set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); - dec_page_count(sbi, F2FS_WRITEBACK); } - - if (!get_pages(sbi, F2FS_WRITEBACK) && wq_has_sleeper(&sbi->cp_wait)) + if (atomic_dec_and_test(&sbi->nr_wb_bios) && + wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); @@ -98,6 +97,14 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } +static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, + struct bio *bio) +{ + if (!is_read_io(rw)) + atomic_inc(&sbi->nr_wb_bios); + submit_bio(rw, bio); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -110,7 +117,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - submit_bio(fio->rw, io->bio); + __submit_bio(io->sbi, fio->rw, io->bio); io->bio = NULL; } @@ -223,12 +230,12 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) /* Allocate a new bio */ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } - submit_bio(fio->rw, bio); + __submit_bio(fio->sbi, fio->rw, bio); return 0; } @@ -248,9 +255,6 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); - if (!is_read) - inc_page_count(sbi, F2FS_WRITEBACK); - if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || io->fio.rw != fio->rw)) __submit_merged_bio(io); @@ -265,8 +269,8 @@ alloc_new: bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < + PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } @@ -278,6 +282,16 @@ alloc_new: trace_f2fs_submit_page_mbio(fio->page, fio); } +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -286,19 +300,9 @@ alloc_new: */ void set_data_blkaddr(struct dnode_of_data *dn) { - struct f2fs_node *rn; - __le32 *addr_array; - struct page *node_page = dn->node_page; - unsigned int ofs_in_node = dn->ofs_in_node; - - f2fs_wait_on_page_writeback(node_page, NODE, true); - - rn = F2FS_NODE(node_page); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - if (set_page_dirty(node_page)) + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) dn->node_changed = true; } @@ -309,24 +313,53 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) f2fs_update_extent_cache(dn); } -int reserve_new_block(struct dnode_of_data *dn) +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + if (!count) + return 0; + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; - trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); + + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; - dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; } +/* Should keep dn->ofs_in_node unchanged */ +int reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; @@ -406,7 +439,7 @@ got_it: * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. */ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); unlock_page(page); return page; @@ -517,7 +550,7 @@ struct page *get_new_data_page(struct inode *inode, goto got_it; if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { f2fs_put_page(page, 1); @@ -530,8 +563,8 @@ struct page *get_new_data_page(struct inode *inode, } got_it: if (new_i_size && i_size_read(inode) < - ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); + ((loff_t)(index + 1) << PAGE_SHIFT)) { + i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); } @@ -545,6 +578,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; int seg = CURSEG_WARM_DATA; pgoff_t fofs; + blkcnt_t count = 1; if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; @@ -553,7 +587,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; alloc: @@ -570,9 +604,9 @@ alloc: /* update i_size */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) i_size_write(dn->inode, - ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); + ((loff_t)(fofs + 1) << PAGE_SHIFT)); return 0; } @@ -582,8 +616,8 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; ssize_t ret = 0; - map.m_lblk = F2FS_BYTES_TO_BLK(iocb->ki_pos); - map.m_len = F2FS_BLK_ALIGN(iov_iter_count(from)); + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); + map.m_len = F2FS_BYTES_TO_BLK(iov_iter_count(from)); map.m_next_pgofs = NULL; if (f2fs_encrypted_inode(inode)) @@ -621,8 +655,10 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; - pgoff_t pgofs, end_offset; + pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; struct extent_info ei; bool allocated = false; block_t blkaddr; @@ -632,6 +668,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { map->m_pblk = ei.blk + pgofs - ei.fofs; @@ -648,6 +685,8 @@ next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; if (err == -ENOENT) { err = 0; if (map->m_next_pgofs) @@ -657,6 +696,8 @@ next_dnode: goto unlock_out; } + prealloc = 0; + ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: @@ -669,31 +710,41 @@ next_block: goto sync_out; } if (flag == F2FS_GET_BLOCK_PRE_AIO) { - if (blkaddr == NULL_ADDR) - err = reserve_new_block(&dn); + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } } else { err = __allocate_data_block(&dn); + if (!err) { + set_inode_flag(F2FS_I(inode), + FI_APPEND_WRITE); + allocated = true; + } } if (err) goto sync_out; - allocated = true; map->m_flags = F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; + } if (flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; } if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; + blkaddr != NEW_ADDR) goto sync_out; - } } } + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; + if (map->m_len == 0) { /* preallocated unwritten block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) @@ -705,33 +756,50 @@ next_block: } else if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || - flag == F2FS_GET_BLOCK_PRE_DIO || - flag == F2FS_GET_BLOCK_PRE_AIO) { + flag == F2FS_GET_BLOCK_PRE_DIO) { ofs++; map->m_len++; } else { goto sync_out; } +skip: dn.ofs_in_node++; pgofs++; - if (map->m_len < maxblocks) { - if (dn.ofs_in_node < end_offset) - goto next_block; + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { - if (allocated) - sync_inode_page(&dn); - f2fs_put_dnode(&dn); + dn.ofs_in_node = ofs_in_node; + err = reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; - if (create) { - f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; } - allocated = false; - goto next_dnode; + dn.ofs_in_node = end_offset; } + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; + + if (allocated) + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); + } + allocated = false; + goto next_dnode; + sync_out: if (allocated) sync_inode_page(&dn); @@ -971,7 +1039,7 @@ got_it: goto confused; } } else { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); unlock_page(page); goto next_page; @@ -983,7 +1051,7 @@ got_it: */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio); bio = NULL; } if (bio == NULL) { @@ -992,7 +1060,7 @@ submit_and_realloc: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode); + ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) goto set_error_page; @@ -1021,22 +1089,22 @@ submit_and_realloc: goto next_page; set_error_page: SetPageError(page); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); goto next_page; confused: if (bio) { - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio); bio = NULL; } unlock_page(page); next_page: if (pages) - page_cache_release(page); + put_page(page); } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio); return 0; } @@ -1092,14 +1160,24 @@ int do_write_data_page(struct f2fs_io_info *fio) } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + gfp_t gfp_flags = GFP_NOFS; /* wait for GCed encrypted page writeback */ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), fio->old_blkaddr); - - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page); +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); + if (err == -ENOMEM) { + /* flush pending ios and wait for a while */ + f2fs_flush_merged_bios(F2FS_I_SB(inode)); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + err = 0; + goto retry_encrypt; + } goto out_writepage; } } @@ -1136,7 +1214,7 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) - >> PAGE_CACHE_SHIFT; + >> PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; @@ -1157,18 +1235,20 @@ static int f2fs_write_data_page(struct page *page, * If the offset is out-of-range of file size, * this page does not have to be written to disk. */ - offset = i_size & (PAGE_CACHE_SIZE - 1); + offset = i_size & (PAGE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) goto out; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); write: if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; - if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)) + /* we should not write 0'th page having journal header */ + if (f2fs_is_volatile_file(inode) && (!page->index || + (!wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; /* Dentry blocks are controlled by checkpoint */ @@ -1267,8 +1347,8 @@ next: cycled = 0; end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ @@ -1448,11 +1528,11 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, * the block addresses when there is no need to fill the page. */ if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) && - len == PAGE_CACHE_SIZE) + len == PAGE_SIZE) return 0; if (f2fs_has_inline_data(inode) || - (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + (pos & PAGE_MASK) >= i_size_read(inode)) { f2fs_lock_op(sbi); locked = true; } @@ -1470,7 +1550,8 @@ restart: if (pos + len <= MAX_INLINE_DATA) { read_inline_data(page, ipage); set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_inline_node(ipage); + if (inode->i_nlink) + set_inline_node(ipage); } else { err = f2fs_convert_inline_page(&dn, page); if (err) @@ -1486,7 +1567,7 @@ restart: } else { /* hole case */ err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err || (!err && dn.data_blkaddr == NULL_ADDR)) { + if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); f2fs_lock_op(sbi); locked = true; @@ -1513,7 +1594,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; + pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; bool need_balance = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -1561,22 +1642,22 @@ repeat: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - if (len == PAGE_CACHE_SIZE) + if (len == PAGE_SIZE) goto out_update; if (PageUptodate(page)) goto out_clear; - if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_CACHE_SIZE - 1); + if ((pos & PAGE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_SIZE - 1); unsigned end = start + len; /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + zero_user_segments(page, 0, start, end, PAGE_SIZE); goto out_update; } if (blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); } else { struct f2fs_io_info fio = { .sbi = sbi, @@ -1655,12 +1736,12 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, return 0; } -static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; int err; err = check_direct_IO(inode, iter, offset); @@ -1672,9 +1753,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); - if (err < 0 && iov_iter_rw(iter) == WRITE) - f2fs_write_failed(mapping, offset + count); + err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); + if (iov_iter_rw(iter) == WRITE) { + if (err > 0) + set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + else if (err < 0) + f2fs_write_failed(mapping, offset + count); + } trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); @@ -1688,7 +1773,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino >= F2FS_ROOT_INO(sbi) && - (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)) + (offset % PAGE_SIZE || length != PAGE_SIZE)) return; if (PageDirty(page)) { @@ -1704,6 +1789,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, if (IS_ATOMIC_WRITTEN_PAGE(page)) return; + set_page_private(page, 0); ClearPagePrivate(page); } @@ -1717,6 +1803,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + set_page_private(page, 0); ClearPagePrivate(page); return 1; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 4fb6ef88a34f..d89a425055d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -48,7 +48,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); + si->wb_bios = atomic_read(&sbi->nr_wb_bios); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -58,6 +58,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -143,6 +144,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); + si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); @@ -164,7 +166,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; - si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE; + si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE; /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); @@ -192,7 +194,7 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i <= ORPHAN_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree); @@ -201,9 +203,9 @@ get_cache: si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -216,8 +218,9 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n", - si->sbi->sb->s_bdev, i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? "RO": "RW"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", @@ -237,6 +240,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Orphan Inode: %u\n", + si->orphans); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -295,15 +300,15 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb: %4d\n", - si->inmem_pages, si->wb_pages); - seq_printf(s, " - nodes: %4d in %4d\n", + seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + si->inmem_pages, si->wb_bios); + seq_printf(s, " - nodes: %4lld in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4d in dirs:%4d\n", + seq_printf(s, " - dents: %4lld in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - datas: %4d in files:%4d\n", + seq_printf(s, " - datas: %4lld in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4d in %4d\n", + seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 80641ad82745..f9313f684540 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -17,8 +17,8 @@ static unsigned long dir_blocks(struct inode *inode) { - return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1)) - >> PAGE_CACHE_SHIFT; + return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) + >> PAGE_SHIFT; } static unsigned int dir_buckets(unsigned int level, int dir_level) @@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_SYMLINK] = DT_LNK, }; -#define S_SHIFT 12 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, @@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +unsigned char get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -95,11 +101,6 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, else kunmap(dentry_page); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); return de; } @@ -124,6 +125,11 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, de = &d->dentry[bit_pos]; + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -141,10 +147,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, *max_slots = max_len; max_len = 0; - /* remain bug on condition */ - if (unlikely(!de->name_len)) - d->max = -1; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -389,9 +391,14 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, return page; if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); err = make_empty_dir(inode, dir, page); - if (err) - goto error; + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); } err = f2fs_init_acl(inode, dir, page, dpage); @@ -435,13 +442,12 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, return page; put_error: - f2fs_put_page(page, 1); -error: - /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ + /* truncate empty dir pages */ truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0, false); - remove_dirty_inode(inode); - remove_inode_page(inode); + + clear_nlink(inode); + update_inode(inode, page); + f2fs_put_page(page, 1); return ERR_PTR(err); } @@ -509,11 +515,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, } } -/* - * Caller should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op(). - */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -526,28 +528,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; - struct fscrypt_name fname; - struct qstr new_name; - int slots, err; - - err = fscrypt_setup_filename(dir, name, 0, &fname); - if (err) - return err; - - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (!err || err != -EAGAIN) - goto out; - else - err = 0; - } + int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name); + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -556,10 +541,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { - err = -ENOSPC; - goto out; - } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_DIR_DEPTH)) + return -ENOSPC; +#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) @@ -573,10 +560,8 @@ start: for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); - goto out; - } + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -596,7 +581,7 @@ add_dentry: if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, &new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -606,7 +591,7 @@ add_dentry: } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -628,7 +613,34 @@ fail: } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); -out: + + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + struct qstr new_name; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + new_name.name = fname_name(&fname); + new_name.len = fname_len(&fname); + + err = -EAGAIN; + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode); + fscrypt_free_filename(&fname); f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; @@ -792,10 +804,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, continue; } - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; + d_type = get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -804,7 +813,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; - de_name.name = kmalloc(de_name.len, GFP_NOFS); + de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS); if (!de_name.name) return false; @@ -887,6 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } + err = 0; out: fscrypt_fname_free_buffer(&fstr); return err; @@ -902,7 +912,7 @@ static int f2fs_dir_open(struct inode *inode, struct file *filp) const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = f2fs_readdir, + .iterate_shared = f2fs_readdir, .fsync = f2fs_sync_file, .open = f2fs_dir_open, .unlocked_ioctl = f2fs_ioctl, diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c859bb044728..5bfcdb9b69f2 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -196,8 +196,7 @@ bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) if (!i_ext || !i_ext->len) return false; - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + get_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bbe2cd1265d0..916e7c238e3d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -37,6 +37,57 @@ } while (0) #endif +#ifdef CONFIG_F2FS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_PAGE_ALLOC, + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_MAX, +}; + +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern struct f2fs_fault_info f2fs_fault; +extern char *fault_name[FAULT_MAX]; +#define IS_FAULT_SET(type) (f2fs_fault.inject_type & (1 << (type))) + +static inline bool time_to_inject(int type) +{ + if (!f2fs_fault.inject_rate) + return false; + if (type == FAULT_KMALLOC && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_PAGE_ALLOC && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_ALLOC_NID && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_ORPHAN && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_BLOCK && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type)) + return false; + + atomic_inc(&f2fs_fault.inject_ops); + if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) { + atomic_set(&f2fs_fault.inject_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + /* * For mount options */ @@ -56,6 +107,7 @@ #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -159,7 +211,6 @@ struct fsync_inode_entry { struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ - block_t last_inode; /* block address locating the last inode */ }; #define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) @@ -385,7 +436,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_pages; /* # of dirty pages */ + struct percpu_counter dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -398,11 +449,11 @@ struct f2fs_inode_info { }; static inline void get_extent_info(struct extent_info *ext, - struct f2fs_extent i_ext) + struct f2fs_extent *i_ext) { - ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk = le32_to_cpu(i_ext.blk); - ext->len = le32_to_cpu(i_ext.len); + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_extent(struct extent_info *ext, @@ -599,7 +650,6 @@ struct f2fs_sm_info { * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ enum count_type { - F2FS_WRITEBACK, F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, @@ -672,6 +722,7 @@ enum { SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ }; enum { @@ -680,6 +731,10 @@ enum { MAX_TIME, }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 +#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -687,6 +742,10 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ int s_flag; /* flags for sbi */ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -742,18 +801,24 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - unsigned int total_valid_inode_count; /* valid inode count */ loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ - block_t alloc_valid_block_count; /* # of allocated blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + atomic_t nr_wb_bios; /* # of writeback bios */ + + /* # of pages, see count_type */ + struct percpu_counter nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + + /* valid inode count */ + struct percpu_counter total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ @@ -1055,21 +1120,33 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) } static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t count) + struct inode *inode, blkcnt_t *count) { block_t valid_block_count; spin_lock(&sbi->stat_lock); - valid_block_count = - sbi->total_valid_block_count + (block_t)count; - if (unlikely(valid_block_count > sbi->user_block_count)) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_BLOCK)) { spin_unlock(&sbi->stat_lock); return false; } - inode->i_blocks += count; - sbi->total_valid_block_count = valid_block_count; - sbi->alloc_valid_block_count += (block_t)count; +#endif + valid_block_count = + sbi->total_valid_block_count + (block_t)(*count); + if (unlikely(valid_block_count > sbi->user_block_count)) { + *count = sbi->user_block_count - sbi->total_valid_block_count; + if (!*count) { + spin_unlock(&sbi->stat_lock); + return false; + } + } + /* *count can be recalculated */ + inode->i_blocks += *count; + sbi->total_valid_block_count = + sbi->total_valid_block_count + (block_t)(*count); spin_unlock(&sbi->stat_lock); + + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); return true; } @@ -1087,20 +1164,20 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_inc(&sbi->nr_pages[count_type]); + percpu_counter_inc(&sbi->nr_pages[count_type]); set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) { - atomic_inc(&F2FS_I(inode)->dirty_pages); + percpu_counter_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_dec(&sbi->nr_pages[count_type]); + percpu_counter_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1109,26 +1186,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - atomic_dec(&F2FS_I(inode)->dirty_pages); + percpu_counter_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } -static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return atomic_read(&sbi->nr_pages[count_type]); + return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); } -static inline int get_dirty_pages(struct inode *inode) +static inline s64 get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_pages); + return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; - return ((get_pages(sbi, block_type) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) @@ -1217,11 +1296,11 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, if (inode) inode->i_blocks++; - sbi->alloc_valid_block_count++; sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->alloc_valid_block_count); return true; } @@ -1248,28 +1327,30 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); - sbi->total_valid_inode_count++; - spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_inode_count); - sbi->total_valid_inode_count--; - spin_unlock(&sbi->stat_lock); + percpu_counter_dec(&sbi->total_valid_inode_count); } -static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { - return sbi->total_valid_inode_count; + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct page *page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(FAULT_PAGE_ALLOC)) + return NULL; +#endif if (!for_write) return grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -1294,7 +1375,7 @@ static inline void f2fs_put_page(struct page *page, int unlock) f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); unlock_page(page); } - page_cache_release(page); + put_page(page); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) @@ -1435,7 +1516,6 @@ enum { FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ FI_UPDATE_DIR, /* should update inode block for consistency */ - FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ @@ -1618,12 +1698,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); } -static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) -{ - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; -} - static inline bool is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -1644,6 +1718,15 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } +static inline void *f2fs_kmalloc(size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_KMALLOC)) + return NULL; +#endif + return kmalloc(size, flags); +} + static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -1710,7 +1793,7 @@ struct dentry *f2fs_get_parent(struct dentry *child); */ extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); - +unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, @@ -1731,6 +1814,8 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); +int f2fs_add_regular_entry(struct inode *, const struct qstr *, + struct inode *, nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, @@ -1781,7 +1866,10 @@ void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); void sync_inode_page(struct dnode_of_data *); -int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +void move_node_page(struct page *, int); +int fsync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *, + bool); +int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -1843,6 +1931,7 @@ void destroy_segment_manager_caches(void); /* * checkpoint.c */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); @@ -1852,7 +1941,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *); +void release_ino_entry(struct f2fs_sb_info *, bool); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); @@ -1861,7 +1950,6 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); -void add_dirty_dir_inode(struct inode *); void remove_dirty_inode(struct inode *); int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); @@ -1880,6 +1968,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); +int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); @@ -1906,7 +1995,7 @@ void build_gc_manager(struct f2fs_sb_info *); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *, bool); bool space_for_roll_forward(struct f2fs_sb_info *); /* @@ -1921,12 +2010,12 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - int ndirty_node, ndirty_meta; - int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files; + s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages; + unsigned int ndirty_dirs, ndirty_files; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inmem_pages, wb_pages; - int inline_xattr, inline_inode, inline_dir; + int bg_gc, wb_bios; + int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b41c3579ea9e..f4c0086655c4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,7 +20,7 @@ #include <linux/uaccess.h> #include <linux/mount.h> #include <linux/pagevec.h> -#include <linux/random.h> +#include <linux/uuid.h> #include "f2fs.h" #include "node.h" @@ -74,11 +74,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { unsigned offset; - offset = i_size_read(inode) & ~PAGE_CACHE_MASK; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + offset = i_size_read(inode) & ~PAGE_MASK; + zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); SetPageUptodate(page); @@ -182,7 +182,8 @@ static void try_to_fix_pino(struct inode *inode) } } -int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; struct f2fs_inode_info *fi = F2FS_I(inode); @@ -256,7 +257,9 @@ go_write: goto out; } sync_nodes: - sync_node_pages(sbi, ino, &wbc); + ret = fsync_node_pages(sbi, ino, &wbc, atomic); + if (ret) + goto out; /* if cp_error was enabled, we should avoid infinite loop */ if (unlikely(f2fs_cp_error(sbi))) { @@ -288,6 +291,11 @@ out: return ret; } +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + return f2fs_do_sync_file(file, start, end, datasync, false); +} + static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { @@ -346,11 +354,11 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto found; } - pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); if (err && err != -ENOENT) { @@ -370,7 +378,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; dn.ofs_in_node++, pgofs++, - data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); @@ -441,7 +449,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { int ret = generic_file_open(inode, filp); - struct inode *dir = filp->f_path.dentry->d_parent->d_inode; + struct dentry *dir; if (!ret && f2fs_encrypted_inode(inode)) { ret = fscrypt_get_encryption_info(inode); @@ -450,9 +458,13 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } - if (f2fs_encrypted_inode(dir) && - !fscrypt_has_permitted_context(dir, inode)) + dir = dget_parent(file_dentry(filp)); + if (f2fs_encrypted_inode(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + dput(dir); return -EPERM; + } + dput(dir); return ret; } @@ -508,8 +520,8 @@ void truncate_data_blocks(struct dnode_of_data *dn) static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { - unsigned offset = from & (PAGE_CACHE_SIZE - 1); - pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; struct page *page; @@ -529,7 +541,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); - zero_user(page, offset, PAGE_CACHE_SIZE - offset); + zero_user(page, offset, PAGE_SIZE - offset); if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) set_page_dirty(page); @@ -551,6 +563,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + if (free_from >= sbi->max_file_blocks) + goto free_partial; + if (lock) f2fs_lock_op(sbi); @@ -569,7 +584,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; @@ -592,7 +607,7 @@ free_next: out: if (lock) f2fs_unlock_op(sbi); - +free_partial: /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); @@ -799,11 +814,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -813,7 +828,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; } @@ -830,8 +845,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; - blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -954,8 +969,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -982,6 +997,49 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + /* + * reserve_new_blocks will not guarantee entire block + * allocation. + */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + if (dn->data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + set_data_blkaddr(dn); + } + } + + f2fs_update_extent_cache_range(dn, start, 0, index - start); + + return ret; +} + static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, int mode) { @@ -1006,11 +1064,11 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, truncate_pagecache_range(inode, offset, offset + len - 1); - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -1024,43 +1082,40 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; new_size = max_t(loff_t, new_size, - (loff_t)pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_SHIFT); } - for (index = pg_start; index < pg_end; index++) { + for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; - struct page *ipage; + unsigned int end_offset; + pgoff_t end; f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - f2fs_unlock_op(sbi); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, index); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; } - if (dn.data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); + + ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + if (ret) + goto out; + index = end; new_size = max_t(loff_t, new_size, - (loff_t)(index + 1) << PAGE_CACHE_SHIFT); + (loff_t)index << PAGE_SHIFT); } if (off_end) { @@ -1117,8 +1172,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; @@ -1143,10 +1198,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t index, pg_start, pg_end; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + pgoff_t pg_end; loff_t new_size = i_size_read(inode); - loff_t off_start, off_end; - int ret = 0; + loff_t off_end; + int ret; ret = inode_newsize_ok(inode, (len + offset)); if (ret) @@ -1158,43 +1214,35 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_balance_fs(sbi, true); - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; + off_end = (offset + len) & (PAGE_SIZE - 1); - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; + map.m_len = pg_end - map.m_lblk; + if (off_end) + map.m_len++; - f2fs_lock_op(sbi); + ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (ret) { + pgoff_t last_off; - for (index = pg_start; index <= pg_end; index++) { - struct dnode_of_data dn; + if (!map.m_len) + return ret; - if (index == pg_end && !off_end) - goto noalloc; + last_off = map.m_lblk + map.m_len - 1; - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = f2fs_reserve_block(&dn, index); - if (ret) - break; -noalloc: - if (pg_start == pg_end) - new_size = offset + len; - else if (index == pg_start && off_start) - new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT; - else if (index == pg_end) - new_size = ((loff_t)index << PAGE_CACHE_SHIFT) + - off_end; - else - new_size += PAGE_CACHE_SIZE; + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? offset + len: + (loff_t)(last_off + 1) << PAGE_SHIFT; + } else { + new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) < new_size) { + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { i_size_write(inode, new_size); mark_inode_dirty(inode); update_inode_page(inode); } - f2fs_unlock_op(sbi); return ret; } @@ -1250,10 +1298,19 @@ out: static int f2fs_release_file(struct inode *inode, struct file *filp) { + /* + * f2fs_relase_file is called at every close calls. So we should + * not drop any inmemory pages by close called by other process. + */ + if (!(filp->f_mode & FMODE_WRITE) || + atomic_read(&inode->i_writecount) != 1) + return 0; + /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE); @@ -1290,20 +1347,16 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) unsigned int oldflags; int ret; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)arg)) + return -EFAULT; + ret = mnt_want_write_file(filp); if (ret) return ret; - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - - if (get_user(flags, (int __user *)arg)) { - ret = -EFAULT; - goto out; - } - flags = f2fs_mask_flags(inode->i_mode, flags); inode_lock(inode); @@ -1346,17 +1399,35 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (f2fs_is_atomic_file(inode)) - return 0; + goto out; ret = f2fs_convert_inline_inode(inode); if (ret) - return ret; + goto out; set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return 0; + if (!get_dirty_pages(inode)) + goto out; + + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + inode->i_ino, get_dirty_pages(inode)); + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_commit_atomic_write(struct file *filp) @@ -1367,13 +1438,15 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - ret = mnt_want_write_file(filp); if (ret) return ret; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto err_out; + if (f2fs_is_atomic_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); @@ -1383,8 +1456,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } } - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1397,32 +1471,54 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (f2fs_is_volatile_file(inode)) - return 0; + goto out; ret = f2fs_convert_inline_inode(inode); if (ret) - return ret; + goto out; set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return 0; +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_release_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (!f2fs_is_volatile_file(inode)) - return 0; + goto out; - if (!f2fs_is_first_block_written(inode)) - return truncate_partial_data_page(inode, 0, true); + if (!f2fs_is_first_block_written(inode)) { + ret = truncate_partial_data_page(inode, 0, true); + goto out; + } - return punch_hole(inode, 0, F2FS_BLKSIZE); + ret = punch_hole(inode, 0, F2FS_BLKSIZE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1437,15 +1533,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); - } if (f2fs_is_volatile_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } + inode_unlock(inode); + mnt_drop_write_file(filp); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; @@ -1457,6 +1555,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1464,31 +1563,38 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); if (sb && !IS_ERR(sb)) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ f2fs_sync_fs(sb, 1); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: sync_meta_pages(sbi, META, LONG_MAX); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } f2fs_update_time(sbi, REQ_TIME); - return 0; +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) @@ -1509,9 +1615,14 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) sizeof(range))) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + range.minlen = max((unsigned int)range.minlen, q->limits.discard_granularity); ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); if (ret < 0) return ret; @@ -1536,13 +1647,21 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct fscrypt_policy policy; struct inode *inode = file_inode(filp); + int ret; if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(inode, &policy); + ret = fscrypt_process_policy(inode, &policy); + + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) @@ -1599,6 +1718,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); __u32 sync; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1609,20 +1729,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) - return -EBUSY; + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } } else { mutex_lock(&sbi->gc_mutex); } - return f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1630,7 +1760,14 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - return f2fs_sync_fs(sbi->sb, 1); + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = f2fs_sync_fs(sbi->sb, 1); + + mnt_drop_write_file(filp); + return ret; } static int f2fs_defragment_range(struct f2fs_sb_info *sbi, @@ -1652,8 +1789,8 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, if (need_inplace_update(inode)) return -EINVAL; - pg_start = range->start >> PAGE_CACHE_SHIFT; - pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT; + pg_start = range->start >> PAGE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_SHIFT; f2fs_balance_fs(sbi, true); @@ -1770,7 +1907,7 @@ clear_out: out: inode_unlock(inode); if (!err) - range->len = (u64)total << PAGE_CACHE_SHIFT; + range->len = (u64)total << PAGE_SHIFT; return err; } @@ -1882,13 +2019,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } inode_unlock(inode); - if (ret > 0) { - ssize_t err; - - err = generic_write_sync(file, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; - } + if (ret > 0) + ret = generic_write_sync(iocb, ret); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b0051a97824c..38d56f678912 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -96,7 +96,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; @@ -465,15 +465,7 @@ next_step: continue; } - /* set page dirty and write it */ - if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE, true); - set_page_dirty(node_page); - } else { - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } - f2fs_put_page(node_page, 1); + move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } @@ -834,18 +826,9 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_put_page(sum_page, 0); } - if (gc_type == FG_GC) { - if (type == SUM_TYPE_NODE) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - sync_node_pages(sbi, 0, &wbc); - } else { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - } - } + if (gc_type == FG_GC) + f2fs_submit_merged_bio(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); blk_finish_plug(&plug); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 358214e9f707..a4bb155dd00a 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -51,7 +51,7 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); /* Copy the whole inline data block */ src_addr = inline_data_addr(ipage); @@ -93,7 +93,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } if (page->index) - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); else read_inline_data(page, ipage); @@ -161,7 +161,7 @@ int f2fs_convert_inline_inode(struct inode *inode) if (!f2fs_has_inline_data(inode)) return 0; - page = grab_cache_page(inode->i_mapping, 0); + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; @@ -303,11 +303,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, else f2fs_put_page(ipage, 0); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(sbi, d.max < 0); return de; } @@ -355,7 +350,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. */ -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct page *page; @@ -363,7 +358,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_dentry_block *dentry_blk; int err; - page = grab_cache_page(dir->i_mapping, 0); + page = f2fs_grab_cache_page(dir->i_mapping, 0, false); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -375,7 +370,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); dentry_blk = kmap_atomic(page); @@ -405,8 +400,9 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, stat_dec_inline_dir(dir); clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); - if (i_size_read(dir) < PAGE_CACHE_SIZE) { - i_size_write(dir, PAGE_CACHE_SIZE); + F2FS_I(dir)->i_current_depth = 1; + if (i_size_read(dir) < PAGE_SIZE) { + i_size_write(dir, PAGE_SIZE); set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } @@ -416,6 +412,105 @@ out: return err; } +static int f2fs_add_inline_entries(struct inode *dir, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + new_name.name = d.filename[bit_pos]; + new_name.len = de->name_len; + + ino = le32_to_cpu(de->ino); + fake_mode = get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, + ino, fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + truncate_blocks(dir, 0, false); + remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_inline_dentry *backup_dentry; + struct f2fs_inode_info *fi = F2FS_I(dir); + int err; + + backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry), + GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + truncate_inline_inode(ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(fi, FI_INLINE_DENTRY); + update_inode(dir, ipage); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + fi->i_current_depth = 0; + i_size_write(dir, MAX_INLINE_DATA); + update_inode(dir, ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index cb269c46ac25..2e68adab0d64 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -283,7 +283,7 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); } return 0; } @@ -344,7 +344,7 @@ void f2fs_evict_inode(struct inode *inode) sb_start_intwrite(inode->i_sb); set_inode_flag(fi, FI_NO_ALLOC); i_size_write(inode, 0); - +retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode, true); @@ -354,6 +354,12 @@ void f2fs_evict_inode(struct inode *inode) f2fs_unlock_op(sbi); } + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + sb_end_intwrite(inode->i_sb); no_delete: stat_dec_inline_xattr(inode); @@ -368,26 +374,11 @@ no_delete: if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) add_ino_entry(sbi, inode->i_ino, UPDATE_INO); if (is_inode_flag_set(fi, FI_FREE_NID)) { - if (err && err != -ENOENT) - alloc_nid_done(sbi, inode->i_ino); - else - alloc_nid_failed(sbi, inode->i_ino); + alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(fi, FI_FREE_NID); } - - if (err && err != -ENOENT) { - if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { - /* - * get here because we failed to release resource - * of inode previously, reminder our user to run fsck - * for fixing. - */ - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "inode (ino:%lu) resource leak, run fsck " - "to fix this issue!", inode->i_ino); - } - } + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); @@ -397,37 +388,32 @@ out_clear: void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int err = 0; + struct node_info ni; - clear_nlink(inode); - make_bad_inode(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); - i_size_write(inode, 0); - if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, false); - - if (!err) - err = remove_inode_page(inode); - /* - * if we skip truncate_node in remove_inode_page bacause we failed - * before, it's better to find another way to release resource of - * this inode (e.g. valid block count, node block or nid). Here we - * choose to add this inode to orphan list, so that we can call iput - * for releasing in orphan recovery flow. - * * Note: we should add inode to orphan list before f2fs_unlock_op() * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - if (err && err != -ENOENT) { - err = acquire_orphan_inode(sbi); - if (!err) + get_node_info(sbi, inode->i_ino, &ni); + + if (ni.blk_addr != NULL_ADDR) { + int err = acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "Too many orphan inodes, run fsck to fix."); + } else { add_orphan_inode(sbi, inode->i_ino); + } + alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(F2FS_I(inode), FI_FREE_NID); } - set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7876f1052101..324ed3812f30 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -202,7 +202,7 @@ struct dentry *f2fs_get_parent(struct dentry *child) unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); + return d_obtain_alias(f2fs_iget(child->d_sb, ino)); } static int __recover_dot_dentries(struct inode *dir, nid_t pino) @@ -1027,12 +1027,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, goto errout; } - /* this is broken symlink case */ - if (unlikely(cstr.name[0] == 0)) { - res = -ENOENT; - goto errout; - } - if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ res = -EIO; @@ -1046,17 +1040,23 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, if (res < 0) goto errout; + /* this is broken symlink case */ + if (unlikely(pstr.name[0] == 0)) { + res = -ENOENT; + goto errout; + } + paddr = pstr.name; /* Null-terminate the name */ paddr[res] = '\0'; - page_cache_release(cpage); + put_page(cpage); set_delayed_call(done, kfree_link, paddr); return paddr; errout: fscrypt_fname_free_buffer(&pstr); - page_cache_release(cpage); + put_page(cpage); return ERR_PTR(res); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 118321bd1a7f..1f21aae80c40 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,11 +46,11 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) */ if (type == FREE_NIDS) { mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { if (sbi->sb->s_bdi->wb.dirty_exceeded) @@ -62,13 +62,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) for (i = 0; i <= UPDATE_INO; i++) mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; + sizeof(struct ino_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * - sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; + sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) @@ -121,7 +121,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -407,6 +407,29 @@ cache: up_write(&nm_i->nat_tree_lock); } +/* + * readahead MAX_RA_NODE number of node pages. + */ +static void ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) { const long direct_index = ADDRS_PER_INODE(dn->inode); @@ -707,6 +730,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } + ra_node_pages(page, ofs, NIDS_PER_BLOCK); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { @@ -784,6 +809,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } + ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); @@ -832,7 +859,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); -restart: + page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); @@ -896,10 +923,7 @@ skip_partial: if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto restart; - } + BUG_ON(page->mapping != NODE_MAPPING(sbi)); f2fs_wait_on_page_writeback(page, NODE, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); @@ -998,7 +1022,7 @@ struct page *new_node_page(struct dnode_of_data *dn, if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1090,7 +1114,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (apage) return; - apage = grab_cache_page(NODE_MAPPING(sbi), nid); + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!apage) return; @@ -1098,29 +1122,6 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -/* - * readahead MAX_RA_NODE number of node pages. - */ -static void ra_node_pages(struct page *parent, int start) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; - int i, end; - nid_t nid; - - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start; i < end; i++) { - nid = get_nid(parent, i, false); - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); -} - static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, struct page *parent, int start) { @@ -1131,7 +1132,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, return ERR_PTR(-ENOENT); f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1144,7 +1145,7 @@ repeat: } if (parent) - ra_node_pages(parent, start + 1); + ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); @@ -1196,19 +1197,17 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct page *page; + int ret; /* should flush inline_data before evict_inode */ inode = ilookup(sbi->sb, ino); if (!inode) return; - page = pagecache_get_page(inode->i_mapping, 0, FGP_NOWAIT, 0); + page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); if (!page) goto iput_out; - if (!trylock_page(page)) - goto release_out; - if (!PageUptodate(page)) goto page_out; @@ -1218,24 +1217,214 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!clear_page_dirty_for_io(page)) goto page_out; - if (!f2fs_write_inline_data(inode, page)) - inode_dec_dirty_pages(inode); - else + ret = f2fs_write_inline_data(inode, page); + inode_dec_dirty_pages(inode); + if (ret) set_page_dirty(page); page_out: - unlock_page(page); -release_out: - f2fs_put_page(page, 0); + f2fs_put_page(page, 1); iput_out: iput(inode); } -int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, - struct writeback_control *wbc) +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(sbi, PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; struct pagevec pvec; - int step = ino ? 2 : 0; + struct page *last_page = NULL; + + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + +int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, + struct writeback_control *wbc, bool atomic) +{ + pgoff_t index, end; + struct pagevec pvec; + int ret = 0; + struct page *last_page = NULL; + bool marked = false; + + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return -EIO; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page) && page != last_page) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + BUG_ON(PageWriteback(page)); + + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) + set_dentry_mark(page, + need_dentry_mark(sbi, ino)); + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + if (ret) { + unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + + if (ret || marked) + break; + } + if (!ret && atomic && !marked) { + f2fs_msg(sbi->sb, KERN_DEBUG, + "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } + return ret ? -EIO: 0; +} + +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +{ + pgoff_t index, end; + struct pagevec pvec; + int step = 0; int nwritten = 0; pagevec_init(&pvec, 0); @@ -1274,15 +1463,8 @@ next_step: if (step == 2 && (!IS_DNODE(page) || !is_cold_node(page))) continue; - - /* - * If an fsync mode, - * we should not skip writing node pages. - */ lock_node: - if (ino && ino_of_node(page) == ino) - lock_page(page); - else if (!trylock_page(page)) + if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { @@ -1290,8 +1472,6 @@ continue_unlock: unlock_page(page); continue; } - if (ino && ino_of_node(page) != ino) - goto continue_unlock; if (!PageDirty(page)) { /* someone wrote it for us */ @@ -1299,7 +1479,7 @@ continue_unlock: } /* flush inline_data */ - if (!ino && is_inline_node(page)) { + if (is_inline_node(page)) { clear_inline_node(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); @@ -1312,17 +1492,8 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - /* called by fsync() */ - if (ino && IS_DNODE(page)) { - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, - need_dentry_mark(sbi, ino)); - nwritten++; - } else { - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); - } + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); @@ -1470,7 +1641,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; - sync_node_pages(sbi, 0, wbc); + sync_node_pages(sbi, wbc); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1524,7 +1695,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - bool allocated = false; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1538,8 +1708,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - allocated = true; - if (allocated) return 0; } @@ -1672,6 +1840,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_ALLOC_NID)) + return false; +#endif if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; @@ -1846,7 +2018,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - ipage = grab_cache_page(NODE_MAPPING(sbi), ino); + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) return -ENOMEM; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 0b30cd2aeebd..3d7216d7a288 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) { - if (sbi->last_valid_block_count + sbi->alloc_valid_block_count - > sbi->user_block_count) + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; return true; } @@ -67,7 +68,30 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static int recover_dentry(struct inode *inode, struct page *ipage) +static struct fsync_inode_entry *add_fsync_inode(struct list_head *head, + struct inode *inode) +{ + struct fsync_inode_entry *entry; + + entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + if (!entry) + return NULL; + + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +} + +static void del_fsync_inode(struct fsync_inode_entry *entry) +{ + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); @@ -75,18 +99,29 @@ static int recover_dentry(struct inode *inode, struct page *ipage) struct qstr name; struct page *page; struct inode *dir, *einode; + struct fsync_inode_entry *entry; int err = 0; - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + dir = f2fs_iget(inode->i_sb, pino); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } + + entry = add_fsync_inode(dir_list, dir); + if (!entry) { + err = -ENOMEM; + iput(dir); + goto out; + } } - if (file_enc_name(inode)) { - iput(dir); + dir = entry->inode; + + if (file_enc_name(inode)) return 0; - } name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; @@ -94,7 +129,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage) if (unlikely(name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; - goto out_err; + goto out; } retry: de = f2fs_find_entry(dir, &name, &page); @@ -120,23 +155,12 @@ retry: goto retry; } err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); - if (err) - goto out_err; - - if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { - iput(dir); - } else { - add_dirty_dir_inode(dir); - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); - } goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); -out_err: - iput(dir); out: f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir = %lx, err = %d", @@ -198,6 +222,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; + struct inode *inode; struct page *page = NULL; block_t blkaddr; int err = 0; @@ -206,8 +231,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR, true); - while (1) { struct fsync_inode_entry *entry; @@ -233,35 +256,32 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) break; } - /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) { - err = -ENOMEM; - break; - } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(entry->inode)) { - err = PTR_ERR(entry->inode); - kmem_cache_free(fsync_entry_slab, entry); + inode = f2fs_iget(sbi->sb, ino_of_node(page)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); if (err == -ENOENT) { err = 0; goto next; } break; } - list_add_tail(&entry->list, head); + + /* add this fsync inode to the list */ + entry = add_fsync_inode(head, inode); + if (!entry) { + err = -ENOMEM; + iput(inode); + break; + } } entry->blkaddr = blkaddr; - if (IS_INODE(page)) { - entry->last_inode = blkaddr; - if (is_dent_dnode(page)) - entry->last_dentry = blkaddr; - } + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -277,11 +297,8 @@ static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; - list_for_each_entry_safe(entry, tmp, head, list) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -444,8 +461,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, */ if (dest == NEW_ADDR) { truncate_data_blocks_range(&dn, 1); - err = reserve_new_block(&dn); - f2fs_bug_on(sbi, err); + reserve_new_block(&dn); continue; } @@ -454,6 +470,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); +#ifdef CONFIG_F2FS_FAULT_INJECTION + while (err) + err = reserve_new_block(&dn); +#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); } @@ -486,7 +506,8 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *dir_list) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; @@ -513,7 +534,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) break; } - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(page)); if (!entry) goto next; /* @@ -521,10 +542,10 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (entry->last_inode == blkaddr) + if (IS_INODE(page)) recover_inode(entry->inode, page); if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, page); + err = recover_dentry(entry->inode, page, dir_list); if (err) { f2fs_put_page(page, 1); break; @@ -536,11 +557,8 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) break; } - if (entry->blkaddr == blkaddr) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + if (entry->blkaddr == blkaddr) + del_fsync_inode(entry); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -551,12 +569,14 @@ next: return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + struct list_head dir_list; block_t blkaddr; int err; + int ret = 0; bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -565,6 +585,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) return -ENOMEM; INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); @@ -573,25 +594,26 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); - if (err) + if (err || list_empty(&inode_list)) goto out; - if (list_empty(&inode_list)) + if (check_only) { + ret = 1; goto out; + } need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list); + err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); - kmem_cache_destroy(fsync_entry_slab); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), - (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1); if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); @@ -625,5 +647,8 @@ out: } else { mutex_unlock(&sbi->cp_mutex); } - return err; + + destroy_fsync_dnodes(&dir_list); + kmem_cache_destroy(fsync_entry_slab); + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6f16b39f0b52..2e6f537a0e7d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -223,9 +223,11 @@ static int __revoke_inmem_pages(struct inode *inode, f2fs_put_dnode(&dn); } next: - ClearPageUptodate(page); + /* we don't need to invalidate this in the sccessful status */ + if (drop || recover) + ClearPageUptodate(page); set_page_private(page, 0); - ClearPageUptodate(page); + ClearPagePrivate(page); f2fs_put_page(page, 1); list_del(&cur->list); @@ -239,6 +241,8 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); @@ -885,12 +889,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) } } - sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= - (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -909,9 +913,9 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) void *dst = page_address(page); if (src) - memcpy(dst, src, PAGE_CACHE_SIZE); + memcpy(dst, src, PAGE_SIZE); else - memset(dst, 0, PAGE_CACHE_SIZE); + memset(dst, 0, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } @@ -1596,7 +1600,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1757,7 +1761,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; - if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1844,7 +1848,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -2171,7 +2175,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NR_CURSEG_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 975c33df65c7..7a756ff5a36d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -158,16 +158,17 @@ struct victim_sel_policy { }; struct seg_entry { - unsigned short valid_blocks; /* # of valid blocks */ + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. */ - unsigned short ckpt_valid_blocks; - unsigned char *ckpt_valid_map; + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; - unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 15bb81f8dac2..74cc8520b8b1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,30 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +#ifdef CONFIG_F2FS_FAULT_INJECTION +struct f2fs_fault_info f2fs_fault; + +char *fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", +}; + +static void f2fs_build_fault_attr(unsigned int rate) +{ + if (rate) { + atomic_set(&f2fs_fault.inject_ops, 0); + f2fs_fault.inject_rate = rate; + f2fs_fault.inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(&f2fs_fault, 0, sizeof(struct f2fs_fault_info)); + } +} +#endif + /* f2fs-wide shrinker description */ static struct shrinker f2fs_shrinker_info = { .scan_objects = f2fs_shrink_scan, @@ -68,6 +92,7 @@ enum { Opt_noextent_cache, Opt_noinline_data, Opt_data_flush, + Opt_fault_injection, Opt_err, }; @@ -93,6 +118,7 @@ static match_table_t f2fs_tokens = { {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, + {Opt_fault_injection, "fault_injection=%u"}, {Opt_err, NULL}, }; @@ -102,6 +128,10 @@ enum { SM_INFO, /* struct f2fs_sm_info */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif }; struct f2fs_attr { @@ -123,6 +153,11 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&f2fs_fault; +#endif return NULL; } @@ -172,6 +207,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif *ui = t; return count; } @@ -237,6 +276,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) @@ -273,6 +316,22 @@ static struct kobj_type f2fs_ktype = { .release = f2fs_sb_release, }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +/* sysfs for f2fs fault injection */ +static struct kobject f2fs_fault_inject; + +static struct attribute *f2fs_fault_attrs[] = { + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), + NULL +}; + +static struct kobj_type f2fs_fault_ktype = { + .default_attrs = f2fs_fault_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; +#endif + void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -300,6 +359,10 @@ static int parse_options(struct super_block *sb, char *options) char *p, *name; int arg = 0; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(0); +#endif + if (!options) return 0; @@ -433,6 +496,16 @@ static int parse_options(struct super_block *sb, char *options) case Opt_data_flush: set_opt(sbi, DATA_FLUSH); break; + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(arg); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -453,9 +526,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); + if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { + kmem_cache_free(f2fs_inode_cachep, fi); + return NULL; + } + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -530,15 +607,27 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { + percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_COUNT_TYPE; i++) + percpu_counter_destroy(&sbi->nr_pages[i]); + percpu_counter_destroy(&sbi->alloc_valid_block_count); + percpu_counter_destroy(&sbi->total_valid_inode_count); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); @@ -568,15 +657,14 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ - release_ino_entry(sbi); + release_ino_entry(sbi, true); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - if (get_pages(sbi, F2FS_WRITEBACK)) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_bios(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -593,6 +681,8 @@ static void f2fs_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + + destroy_percpu_info(sbi); kfree(sbi); } @@ -745,19 +835,47 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_info_open_fs(struct inode *inode, struct file *file) +static int segment_bits_seq_show(struct seq_file *seq, void *offset) { - return single_open(file, segment_info_seq_show, PDE_DATA(inode)); + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, 1)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, "%x ", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; } -static const struct file_operations f2fs_seq_segment_info_fops = { - .owner = THIS_MODULE, - .open = segment_info_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .owner = THIS_MODULE, \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ }; +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ @@ -791,13 +909,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; - if (*flags & MS_RDONLY) { - set_opt(sbi, FASTBOOT); - set_sbi_flag(sbi, SBI_IS_DIRTY); + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_msg(sb, KERN_INFO, + "Try to recover all the superblocks, ret: %d", err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sync_filesystem(sb); - sbi->mount_opt.opt = 0; default_options(sbi); @@ -829,7 +949,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); - f2fs_sync_fs(sb, 1); need_restart_gc = true; } } else if (!sbi->gc_thread) { @@ -839,6 +958,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } + if (*flags & MS_RDONLY) { + writeback_inodes_sb(sb, WB_REASON_SYNC); + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -852,8 +981,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } skip: /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; restore_gc: if (need_restart_gc) { @@ -893,6 +1023,12 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } +static int f2fs_key_prefix(struct inode *inode, u8 **key) +{ + *key = F2FS_I_SB(inode)->key_prefix; + return F2FS_I_SB(inode)->key_prefix_size; +} + static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -909,6 +1045,7 @@ static unsigned f2fs_max_namelen(struct inode *inode) static struct fscrypt_operations f2fs_cryptops = { .get_context = f2fs_get_context, + .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, @@ -984,9 +1121,26 @@ static loff_t max_file_blocks(void) return result; } -static inline bool sanity_check_area_boundary(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int __f2fs_commit_super(struct buffer_head *bh, + struct f2fs_super_block *super) { + lock_buffer(bh); + if (super) + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); +} + +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); @@ -1000,6 +1154,10 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); u32 segment_count = le32_to_cpu(raw_super->segment_count); u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + u64 main_end_blkaddr = main_blkaddr + + (segment_count_main << log_blocks_per_seg); + u64 seg_end_blkaddr = segment0_blkaddr + + (segment_count << log_blocks_per_seg); if (segment0_blkaddr != cp_blkaddr) { f2fs_msg(sb, KERN_INFO, @@ -1044,22 +1202,47 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, return true; } - if (main_blkaddr + (segment_count_main << log_blocks_per_seg) != - segment0_blkaddr + (segment_count << log_blocks_per_seg)) { + if (main_end_blkaddr > seg_end_blkaddr) { f2fs_msg(sb, KERN_INFO, - "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)", + "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)", main_blkaddr, - segment0_blkaddr + (segment_count << log_blocks_per_seg), + segment0_blkaddr + + (segment_count << log_blocks_per_seg), segment_count_main << log_blocks_per_seg); return true; + } else if (main_end_blkaddr < seg_end_blkaddr) { + int err = 0; + char *res; + + /* fix in-memory information all the time */ + raw_super->segment_count = cpu_to_le32((main_end_blkaddr - + segment0_blkaddr) >> log_blocks_per_seg); + + if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + res = "internally"; + } else { + err = __f2fs_commit_super(bh, NULL); + res = err ? "failed" : "done"; + } + f2fs_msg(sb, KERN_INFO, + "Fix alignment : %s, start(%u) end(%u) block(%u)", + res, main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + if (err) + return true; } - return false; } -static int sanity_check_raw_super(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct buffer_head *bh) { + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; unsigned int blocksize; if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { @@ -1070,10 +1253,10 @@ static int sanity_check_raw_super(struct super_block *sb, } /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { + if (F2FS_BLKSIZE != PAGE_SIZE) { f2fs_msg(sb, KERN_INFO, "Invalid page_cache_size (%lu), supports only 4KB\n", - PAGE_CACHE_SIZE); + PAGE_SIZE); return 1; } @@ -1126,7 +1309,7 @@ static int sanity_check_raw_super(struct super_block *sb, } /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ - if (sanity_check_area_boundary(sb, raw_super)) + if (sanity_check_area_boundary(sbi, bh)) return 1; return 0; @@ -1158,7 +1341,6 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1178,9 +1360,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->cur_victim_sec = NULL_SECNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; - for (i = 0; i < NR_COUNT_TYPE; i++) - atomic_set(&sbi->nr_pages[i], 0); - sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; @@ -1188,6 +1367,30 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; +#endif +} + +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int i, err; + + for (i = 0; i < NR_COUNT_TYPE; i++) { + err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); + if (err) + return err; + } + + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + return percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); } /* @@ -1196,13 +1399,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) * to get the first valid one. If any one of them is broken, we pass * them recovery flag back to the caller. */ -static int read_raw_super_block(struct super_block *sb, +static int read_raw_super_block(struct f2fs_sb_info *sbi, struct f2fs_super_block **raw_super, int *valid_super_block, int *recovery) { + struct super_block *sb = sbi->sb; int block; struct buffer_head *bh; - struct f2fs_super_block *super, *buf; + struct f2fs_super_block *super; int err = 0; super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); @@ -1218,11 +1422,8 @@ static int read_raw_super_block(struct super_block *sb, continue; } - buf = (struct f2fs_super_block *) - (bh->b_data + F2FS_SUPER_OFFSET); - /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, buf)) { + if (sanity_check_raw_super(sbi, bh)) { f2fs_msg(sb, KERN_ERR, "Can't find valid F2FS filesystem in %dth superblock", block + 1); @@ -1232,7 +1433,8 @@ static int read_raw_super_block(struct super_block *sb, } if (!*raw_super) { - memcpy(super, buf, sizeof(*super)); + memcpy(super, bh->b_data + F2FS_SUPER_OFFSET, + sizeof(*super)); *valid_super_block = block; *raw_super = super; } @@ -1252,42 +1454,35 @@ static int read_raw_super_block(struct super_block *sb, return err; } -static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block) +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct f2fs_super_block *super = F2FS_RAW_SUPER(sbi); struct buffer_head *bh; int err; - bh = sb_getblk(sbi->sb, block); + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } + + /* write back-up superblock first */ + bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); if (!bh) return -EIO; - - lock_buffer(bh); - memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); - set_buffer_uptodate(bh); - set_buffer_dirty(bh); - unlock_buffer(bh); - - /* it's rare case, we can do fua all the time */ - err = __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); brelse(bh); - return err; -} - -int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) -{ - int err; - - /* write back-up superblock first */ - err = __f2fs_commit_super(sbi, sbi->valid_super_block ? 0 : 1); - /* if we are in recovery path, skip writing valid superblock */ if (recover || err) return err; /* write current valid superblock */ - return __f2fs_commit_super(sbi, sbi->valid_super_block); + bh = sb_getblk(sbi->sb, sbi->valid_super_block); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); + return err; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -1295,7 +1490,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; struct inode *root; - long err; + int err; bool retry = true, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; @@ -1312,6 +1507,8 @@ try_onemore: if (!sbi) return -ENOMEM; + sbi->sb = sb; + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -1327,7 +1524,7 @@ try_onemore: goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &valid_super_block, + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, &recovery); if (err) goto free_sbi; @@ -1362,7 +1559,6 @@ try_onemore: memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ - sbi->sb = sb; sbi->raw_super = raw_super; sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); @@ -1387,6 +1583,10 @@ try_onemore: init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = init_percpu_info(sbi); + if (err) + goto free_options; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { @@ -1403,13 +1603,13 @@ try_onemore: sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); - sbi->total_valid_inode_count = - le32_to_cpu(sbi->ckpt->valid_inode_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); spin_lock_init(&sbi->inode_lock[i]); @@ -1442,7 +1642,7 @@ try_onemore: seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); if (__exist_node_summaries(sbi)) sbi->kbytes_written = - le64_to_cpu(seg_i->sum_blk->journal.info.kbytes_written); + le64_to_cpu(seg_i->journal->info.kbytes_written); build_gc_manager(sbi); @@ -1487,9 +1687,12 @@ try_onemore: if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - if (sbi->s_proc) + if (sbi->s_proc) { proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); @@ -1513,14 +1716,24 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - err = recover_fsync_data(sbi); - if (err) { + err = recover_fsync_data(sbi, false); + if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); + "Cannot recover all fsync data errno=%d", err); + goto free_kobj; + } + } else { + err = recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_msg(sb, KERN_ERR, + "Need to recover fsync data"); goto free_kobj; } } + /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -1537,10 +1750,10 @@ try_onemore: kfree(options); /* recover broken superblock */ - if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { + if (recovery) { err = f2fs_commit_super(sbi, true); f2fs_msg(sb, KERN_INFO, - "Try to recover %dth superblock, ret: %ld", + "Try to recover %dth superblock, ret: %d", sbi->valid_super_block ? 1 : 2, err); } @@ -1555,6 +1768,7 @@ free_kobj: free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); @@ -1575,6 +1789,7 @@ free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); free_options: + destroy_percpu_info(sbi); kfree(options); free_sb_buf: kfree(raw_super); @@ -1660,6 +1875,16 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_fault_inject.kset = f2fs_kset; + f2fs_build_fault_attr(0); + err = kobject_init_and_add(&f2fs_fault_inject, &f2fs_fault_ktype, + NULL, "fault_injection"); + if (err) { + f2fs_fault_inject.kset = NULL; + goto free_kset; + } +#endif err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_kset; @@ -1678,6 +1903,10 @@ free_filesystem: free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_kset: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (f2fs_fault_inject.kset) + kobject_put(&f2fs_fault_inject); +#endif kset_unregister(f2fs_kset); free_extent_cache: destroy_extent_cache(); @@ -1697,14 +1926,17 @@ static void __exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); + unregister_shrinker(&f2fs_shrinker_info); +#ifdef CONFIG_F2FS_FAULT_INJECTION + kobject_put(&f2fs_fault_inject); +#endif + kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); - kset_unregister(f2fs_kset); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 06a72dc0191a..00ea56797258 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -26,10 +26,10 @@ #include "xattr.h" static int f2fs_xattr_generic_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, void *buffer, - size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); switch (handler->flags) { case F2FS_XATTR_INDEX_USER: @@ -45,7 +45,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, default: return -EINVAL; } - return f2fs_getxattr(d_inode(dentry), handler->flags, name, + return f2fs_getxattr(inode, handler->flags, name, buffer, size, NULL); } @@ -86,11 +86,9 @@ static bool f2fs_xattr_trusted_list(struct dentry *dentry) } static int f2fs_xattr_advise_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, void *buffer, - size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - struct inode *inode = d_inode(dentry); - if (buffer) *((char *)buffer) = F2FS_I(inode)->i_advise; return sizeof(char); @@ -500,7 +498,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, free = free + ENTRY_SIZE(here); if (unlikely(free < newsize)) { - error = -ENOSPC; + error = -E2BIG; goto exit; } } @@ -528,7 +526,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, * Before we come here, old entry is removed. * We just write new entry. */ - memset(last, 0, newsize); last->e_name_index = index; last->e_name_len = len; memcpy(last->e_name, name, len); diff --git a/fs/fat/dir.c b/fs/fat/dir.c index d0b95c95079b..663e428596c6 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, buf.dirent = dirent; buf.result = 0; - inode_lock(inode); + inode_lock_shared(inode); buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, short_only, both ? &buf : NULL); file->f_pos = buf.ctx.pos; } - inode_unlock(inode); + inode_unlock_shared(inode); if (ret >= 0) ret = buf.result; return ret; @@ -861,7 +861,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, const struct file_operations fat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = fat_readdir, + .iterate_shared = fat_readdir, .unlocked_ioctl = fat_dir_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 226281068a46..3bcf57925dca 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -244,13 +244,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping, return err; } -static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; ssize_t ret; if (iov_iter_rw(iter) == WRITE) { @@ -272,7 +272,7 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). */ - ret = blockdev_direct_IO(iocb, inode, iter, offset, fat_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, fat_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) fat_write_failed(mapping, offset + count); diff --git a/fs/file.c b/fs/file.c index 1fbc5c0555a9..6b1acdfe59da 100644 --- a/fs/file.c +++ b/fs/file.c @@ -784,6 +784,11 @@ unsigned long __fdget_pos(unsigned int fd) return v; } +void __f_unlock_pos(struct file *f) +{ + mutex_unlock(&f->f_pos_lock); +} + /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index cb84f0fcc72a..bfc780c682fb 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -66,11 +66,11 @@ static int vxfs_immed_readpage(struct file *fp, struct page *pp) { struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); - u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT; + u_int64_t offset = (u_int64_t)pp->index << PAGE_SHIFT; caddr_t kaddr; kaddr = kmap(pp); - memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE); + memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE); kunmap(pp); flush_dcache_page(pp); diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 1cff72df0389..6d576b97f2c8 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -45,7 +45,7 @@ /* * Number of VxFS blocks per page. */ -#define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_CACHE_SIZE / (sbp)->s_blocksize)) +#define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_SIZE / (sbp)->s_blocksize)) static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); @@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = { const struct file_operations vxfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = vxfs_readdir, + .iterate_shared = vxfs_readdir, }; static inline u_long @@ -175,7 +175,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) if (de) { ino = de->d_ino; kunmap(pp); - page_cache_release(pp); + put_page(pp); } return (ino); @@ -255,8 +255,8 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx) nblocks = dir_blocks(ip); pblocks = VXFS_BLOCK_PER_PAGE(sbp); - page = pos >> PAGE_CACHE_SHIFT; - offset = pos & ~PAGE_CACHE_MASK; + page = pos >> PAGE_SHIFT; + offset = pos & ~PAGE_MASK; block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; for (; page < npages; page++, block = 0) { @@ -289,7 +289,7 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx) continue; offset = (char *)de - kaddr; - ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; + ctx->pos = ((page << PAGE_SHIFT) | offset) + 2; if (!dir_emit(ctx, de->d_name, de->d_namelen, de->d_ino, DT_UNKNOWN)) { vxfs_put_page(pp); @@ -301,6 +301,6 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx) vxfs_put_page(pp); offset = 0; } - ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; + ctx->pos = ((page << PAGE_SHIFT) | offset) + 2; return 0; } diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c index 5d318c44f855..e806694d4145 100644 --- a/fs/freevxfs/vxfs_subr.c +++ b/fs/freevxfs/vxfs_subr.c @@ -50,7 +50,7 @@ inline void vxfs_put_page(struct page *pp) { kunmap(pp); - page_cache_release(pp); + put_page(pp); } /** diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fee81e8768c9..989a2cef6b76 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -33,7 +33,7 @@ /* * 4MB minimal write chunk size */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) struct wb_completion { atomic_t cnt; @@ -931,7 +931,8 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ - work = kzalloc(sizeof(*work), GFP_ATOMIC); + work = kzalloc(sizeof(*work), + GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (!work) { trace_writeback_nowork(wb); wb_wakeup(wb); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 6b35fc4860a0..3078b679fcd1 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -113,7 +113,7 @@ try_again: wake_up_bit(&cookie->flags, 0); if (xpage) - page_cache_release(xpage); + put_page(xpage); __fscache_uncache_page(cookie, page); return true; @@ -164,7 +164,7 @@ static void fscache_end_page_write(struct fscache_object *object, } spin_unlock(&object->lock); if (xpage) - page_cache_release(xpage); + put_page(xpage); } /* @@ -884,7 +884,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) spin_unlock(&cookie->stores_lock); for (i = n - 1; i >= 0; i--) - page_cache_release(results[i]); + put_page(results[i]); } _leave(""); @@ -982,7 +982,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, radix_tree_tag_set(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG); - page_cache_get(page); + get_page(page); /* we only want one writer at a time, but we do need to queue new * writers after exclusive ops */ @@ -1026,7 +1026,7 @@ submit_failed: radix_tree_delete(&cookie->stores, page->index); spin_unlock(&cookie->stores_lock); wake_cookie = __fscache_unuse_cookie(cookie); - page_cache_release(page); + put_page(page); ret = -ENOBUFS; goto nobufs; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ebb5e37455a0..cbece1221417 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -897,7 +897,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) return err; } - page_cache_get(newpage); + get_page(newpage); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) lru_cache_add_file(newpage); @@ -912,12 +912,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (err) { unlock_page(newpage); - page_cache_release(newpage); + put_page(newpage); return err; } unlock_page(oldpage); - page_cache_release(oldpage); + put_page(oldpage); cs->len = 0; return 0; @@ -951,7 +951,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, fuse_copy_finish(cs); buf = cs->pipebufs; - page_cache_get(page); + get_page(page); buf->page = page; buf->offset = offset; buf->len = count; @@ -1435,7 +1435,7 @@ out_unlock: out: for (; page_nr < cs.nr_segs; page_nr++) - page_cache_release(bufs[page_nr].page); + put_page(bufs[page_nr].page); kfree(bufs); return ret; @@ -1632,8 +1632,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, goto out_up_killsb; mapping = inode->i_mapping; - index = outarg.offset >> PAGE_CACHE_SHIFT; - offset = outarg.offset & ~PAGE_CACHE_MASK; + index = outarg.offset >> PAGE_SHIFT; + offset = outarg.offset & ~PAGE_MASK; file_size = i_size_read(inode); end = outarg.offset + outarg.size; if (end > file_size) { @@ -1652,13 +1652,13 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, if (!page) goto out_iput; - this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + this_num = min_t(unsigned, num, PAGE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); if (!err && offset == 0 && - (this_num == PAGE_CACHE_SIZE || file_size == end)) + (this_num == PAGE_SIZE || file_size == end)) SetPageUptodate(page); unlock_page(page); - page_cache_release(page); + put_page(page); if (err) goto out_iput; @@ -1697,7 +1697,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, size_t total_len = 0; int num_pages; - offset = outarg->offset & ~PAGE_CACHE_MASK; + offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); num = outarg->size; @@ -1720,7 +1720,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; - index = outarg->offset >> PAGE_CACHE_SHIFT; + index = outarg->offset >> PAGE_SHIFT; while (num && req->num_pages < num_pages) { struct page *page; @@ -1730,7 +1730,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, if (!page) break; - this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + this_num = min_t(unsigned, num, PAGE_SIZE - offset); req->pages[req->num_pages] = page; req->page_descs[req->num_pages].length = this_num; req->num_pages++; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 4b855b65d457..b9419058108f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1162,7 +1162,6 @@ static int fuse_direntplus_link(struct file *file, struct fuse_direntplus *direntplus, u64 attr_version) { - int err; struct fuse_entry_out *o = &direntplus->entry_out; struct fuse_dirent *dirent = &direntplus->dirent; struct dentry *parent = file->f_path.dentry; @@ -1172,6 +1171,7 @@ static int fuse_direntplus_link(struct file *file, struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (!o->nodeid) { /* @@ -1204,65 +1204,61 @@ static int fuse_direntplus_link(struct file *file, name.hash = full_name_hash(name.name, name.len); dentry = d_lookup(parent, &name); - if (dentry) { + if (!dentry) { +retry: + dentry = d_alloc_parallel(parent, &name, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (!d_in_lookup(dentry)) { + struct fuse_inode *fi; inode = d_inode(dentry); - if (!inode) { - d_drop(dentry); - } else if (get_node_id(inode) != o->nodeid || - ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + if (!inode || + get_node_id(inode) != o->nodeid || + ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { d_invalidate(dentry); - } else if (is_bad_inode(inode)) { - err = -EIO; - goto out; - } else { - struct fuse_inode *fi; - fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->nlookup++; - spin_unlock(&fc->lock); - - fuse_change_attributes(inode, &o->attr, - entry_attr_timeout(o), - attr_version); - - /* - * The other branch to 'found' comes via fuse_iget() - * which bumps nlookup inside - */ - goto found; + dput(dentry); + goto retry; + } + if (is_bad_inode(inode)) { + dput(dentry); + return -EIO; } - dput(dentry); - } - - dentry = d_alloc(parent, &name); - err = -ENOMEM; - if (!dentry) - goto out; - inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, - &o->attr, entry_attr_timeout(o), attr_version); - if (!inode) - goto out; + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); - alias = d_splice_alias(inode, dentry); - err = PTR_ERR(alias); - if (IS_ERR(alias)) - goto out; + fuse_change_attributes(inode, &o->attr, + entry_attr_timeout(o), + attr_version); + /* + * The other branch comes via fuse_iget() + * which bumps nlookup inside + */ + } else { + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), + attr_version); + if (!inode) + inode = ERR_PTR(-ENOMEM); - if (alias) { - dput(dentry); - dentry = alias; + alias = d_splice_alias(inode, dentry); + d_lookup_done(dentry); + if (alias) { + dput(dentry); + dentry = alias; + } + if (IS_ERR(dentry)) + return PTR_ERR(dentry); } - -found: if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); fuse_change_entry_timeout(dentry, o); - err = 0; -out: dput(dentry); - return err; + return 0; } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, @@ -1759,10 +1755,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name, return err; } -static ssize_t fuse_getxattr(struct dentry *entry, const char *name, - void *value, size_t size) +static ssize_t fuse_getxattr(struct dentry *entry, struct inode *inode, + const char *name, void *value, size_t size) { - struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; @@ -1893,7 +1888,7 @@ static const struct inode_operations fuse_dir_inode_operations = { static const struct file_operations fuse_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = fuse_readdir, + .iterate_shared = fuse_readdir, .open = fuse_dir_open, .release = fuse_dir_release, .fsync = fuse_dir_fsync, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9dde38f12c07..9154f8679024 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -348,7 +348,7 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, pgoff_t curr_index; BUG_ON(req->inode != inode); - curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + curr_index = req->misc.write.in.offset >> PAGE_SHIFT; if (idx_from < curr_index + req->num_pages && curr_index <= idx_to) { found = true; @@ -683,11 +683,11 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, * present there. */ int i; - int start_idx = num_read >> PAGE_CACHE_SHIFT; - size_t off = num_read & (PAGE_CACHE_SIZE - 1); + int start_idx = num_read >> PAGE_SHIFT; + size_t off = num_read & (PAGE_SIZE - 1); for (i = start_idx; i < req->num_pages; i++) { - zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE); + zero_user_segment(req->pages[i], off, PAGE_SIZE); off = 0; } } else { @@ -704,7 +704,7 @@ static int fuse_do_readpage(struct file *file, struct page *page) struct fuse_req *req; size_t num_read; loff_t pos = page_offset(page); - size_t count = PAGE_CACHE_SIZE; + size_t count = PAGE_SIZE; u64 attr_ver; int err; @@ -789,7 +789,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) else SetPageError(page); unlock_page(page); - page_cache_release(page); + put_page(page); } if (req->ff) fuse_file_put(req->ff, false); @@ -800,7 +800,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; loff_t pos = page_offset(req->pages[0]); - size_t count = req->num_pages << PAGE_CACHE_SHIFT; + size_t count = req->num_pages << PAGE_SHIFT; req->out.argpages = 1; req->out.page_zeroing = 1; @@ -836,7 +836,7 @@ static int fuse_readpages_fill(void *_data, struct page *page) if (req->num_pages && (req->num_pages == FUSE_MAX_PAGES_PER_REQ || - (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || + (req->num_pages + 1) * PAGE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { int nr_alloc = min_t(unsigned, data->nr_pages, FUSE_MAX_PAGES_PER_REQ); @@ -858,7 +858,7 @@ static int fuse_readpages_fill(void *_data, struct page *page) return -EIO; } - page_cache_get(page); + get_page(page); req->pages[req->num_pages] = page; req->page_descs[req->num_pages].length = PAGE_SIZE; req->num_pages++; @@ -1003,17 +1003,17 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; - if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) + if (!req->out.h.error && !offset && count >= PAGE_SIZE) SetPageUptodate(page); - if (count > PAGE_CACHE_SIZE - offset) - count -= PAGE_CACHE_SIZE - offset; + if (count > PAGE_SIZE - offset) + count -= PAGE_SIZE - offset; else count = 0; offset = 0; unlock_page(page); - page_cache_release(page); + put_page(page); } return res; @@ -1024,7 +1024,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, struct iov_iter *ii, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(mapping->host); - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; int err; @@ -1034,8 +1034,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, do { size_t tmp; struct page *page; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, + pgoff_t index = pos >> PAGE_SHIFT; + size_t bytes = min_t(size_t, PAGE_SIZE - offset, iov_iter_count(ii)); bytes = min_t(size_t, bytes, fc->max_write - count); @@ -1059,7 +1059,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, iov_iter_advance(ii, tmp); if (!tmp) { unlock_page(page); - page_cache_release(page); + put_page(page); bytes = min(bytes, iov_iter_single_seg_count(ii)); goto again; } @@ -1072,7 +1072,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, count += tmp; pos += tmp; offset += tmp; - if (offset == PAGE_CACHE_SIZE) + if (offset == PAGE_SIZE) offset = 0; if (!fc->big_writes) @@ -1086,8 +1086,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, static inline unsigned fuse_wr_pages(loff_t pos, size_t len) { return min_t(unsigned, - ((pos + len - 1) >> PAGE_CACHE_SHIFT) - - (pos >> PAGE_CACHE_SHIFT) + 1, + ((pos + len - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT) + 1, FUSE_MAX_PAGES_PER_REQ); } @@ -1186,7 +1186,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_DIRECT) { loff_t pos = iocb->ki_pos; - written = generic_file_direct_write(iocb, from, pos); + written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) goto out; @@ -1205,8 +1205,8 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; invalidate_mapping_pages(file->f_mapping, - pos >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); + pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); written += written_buffered; iocb->ki_pos = pos + written_buffered; @@ -1295,7 +1295,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, *nbytesp = nbytes; - return ret; + return ret < 0 ? ret : 0; } static inline int fuse_iter_npages(const struct iov_iter *ii_p) @@ -1315,8 +1315,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, size_t nmax = write ? fc->max_write : fc->max_read; loff_t pos = *ppos; size_t count = iov_iter_count(iter); - pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT; - pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT; + pgoff_t idx_from = pos >> PAGE_SHIFT; + pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; ssize_t res = 0; struct fuse_req *req; int err = 0; @@ -1466,7 +1466,7 @@ __acquires(fc->lock) { struct fuse_inode *fi = get_fuse_inode(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; - __u64 data_size = req->num_pages * PAGE_CACHE_SIZE; + __u64 data_size = req->num_pages * PAGE_SIZE; if (!fc->connected) goto out_free; @@ -1727,7 +1727,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, list_del(&new_req->writepages_entry); list_for_each_entry(old_req, &fi->writepages, writepages_entry) { BUG_ON(old_req->inode != new_req->inode); - curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT; if (curr_index <= page->index && page->index < curr_index + old_req->num_pages) { found = true; @@ -1742,7 +1742,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, new_req->num_pages = 1; for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) { BUG_ON(tmp->inode != new_req->inode); - curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT; + curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; if (tmp->num_pages == 1 && curr_index == page->index) { old_req = tmp; @@ -1799,7 +1799,7 @@ static int fuse_writepages_fill(struct page *page, if (req && req->num_pages && (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ || - (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write || + (req->num_pages + 1) * PAGE_SIZE > fc->max_write || data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); data->req = NULL; @@ -1924,7 +1924,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); struct page *page; loff_t fsize; @@ -1938,15 +1938,15 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, fuse_wait_on_page_writeback(mapping->host, page->index); - if (PageUptodate(page) || len == PAGE_CACHE_SIZE) + if (PageUptodate(page) || len == PAGE_SIZE) goto success; /* * Check if the start this page comes after the end of file, in which * case the readpage can be optimized away. */ fsize = i_size_read(mapping->host); - if (fsize <= (pos & PAGE_CACHE_MASK)) { - size_t off = pos & ~PAGE_CACHE_MASK; + if (fsize <= (pos & PAGE_MASK)) { + size_t off = pos & ~PAGE_MASK; if (off) zero_user_segment(page, 0, off); goto success; @@ -1960,7 +1960,7 @@ success: cleanup: unlock_page(page); - page_cache_release(page); + put_page(page); error: return err; } @@ -1973,16 +1973,16 @@ static int fuse_write_end(struct file *file, struct address_space *mapping, if (!PageUptodate(page)) { /* Zero any unwritten bytes at the end of the page */ - size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK; + size_t endoff = (pos + copied) & ~PAGE_MASK; if (endoff) - zero_user_segment(page, endoff, PAGE_CACHE_SIZE); + zero_user_segment(page, endoff, PAGE_SIZE); SetPageUptodate(page); } fuse_write_update_size(inode, pos + copied); set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); return copied; } @@ -2837,7 +2837,7 @@ static inline loff_t fuse_round_up(loff_t off) } static ssize_t -fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { DECLARE_COMPLETION_ONSTACK(wait); ssize_t ret = 0; @@ -2848,6 +2848,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) struct inode *inode; loff_t i_size; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; bool is_sync = is_sync_kiocb(iocb); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4d69d5c0bedc..1ce67668a8e1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -339,11 +339,11 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, fuse_invalidate_attr(inode); if (offset >= 0) { - pg_start = offset >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; if (len <= 0) pg_end = -1; else - pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pg_end = (offset + len - 1) >> PAGE_SHIFT; invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); } @@ -864,7 +864,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) process_init_limits(fc, arg); if (arg->minor >= 6) { - ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; + ra_pages = arg->max_readahead / PAGE_SIZE; if (arg->flags & FUSE_ASYNC_READ) fc->async_read = 1; if (!(arg->flags & FUSE_POSIX_LOCKS)) @@ -901,7 +901,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) if (arg->time_gran && arg->time_gran <= 1000000000) fc->sb->s_time_gran = arg->time_gran; } else { - ra_pages = fc->max_read / PAGE_CACHE_SIZE; + ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } @@ -922,7 +922,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; + arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | @@ -955,7 +955,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) int err; fc->bdi.name = "fuse"; - fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; /* fuse does it's own writeback accounting */ fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; @@ -1053,8 +1053,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err; #endif } else { - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; } sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 791932617d1a..363ba9e9d8d0 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -24,6 +24,7 @@ #include "glock.h" #include "inode.h" #include "meta_io.h" +#include "rgrp.h" #include "trans.h" #include "util.h" @@ -38,7 +39,7 @@ static const char *gfs2_acl_name(int type) return NULL; } -struct posix_acl *gfs2_get_acl(struct inode *inode, int type) +static struct posix_acl *__gfs2_get_acl(struct inode *inode, int type) { struct gfs2_inode *ip = GFS2_I(inode); struct posix_acl *acl; @@ -50,29 +51,41 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) return NULL; name = gfs2_acl_name(type); - if (name == NULL) - return ERR_PTR(-EINVAL); - len = gfs2_xattr_acl_get(ip, name, &data); - if (len < 0) + if (len <= 0) return ERR_PTR(len); - if (len == 0) - return NULL; - acl = posix_acl_from_xattr(&init_user_ns, data, len); kfree(data); return acl; } -int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +struct posix_acl *gfs2_get_acl(struct inode *inode, int type) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + bool need_unlock = false; + struct posix_acl *acl; + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + int ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_ANY, &gh); + if (ret) + return ERR_PTR(ret); + need_unlock = true; + } + acl = __gfs2_get_acl(inode, type); + if (need_unlock) + gfs2_glock_dq_uninit(&gh); + return acl; +} + +int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; int len; char *data; const char *name = gfs2_acl_name(type); - BUG_ON(name == NULL); - if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) return -E2BIG; @@ -115,3 +128,26 @@ out: kfree(data); return error; } + +int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + bool need_unlock = false; + int ret; + + ret = gfs2_rsqa_alloc(ip); + if (ret) + return ret; + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + return ret; + need_unlock = true; + } + ret = __gfs2_set_acl(inode, acl, type); + if (need_unlock) + gfs2_glock_dq_uninit(&gh); + return ret; +} diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 3af4f407a483..f674fdd22337 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -15,6 +15,7 @@ #define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); +extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index aa016e4b8bec..37b7bc14c8da 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -101,7 +101,7 @@ static int gfs2_writepage_common(struct page *page, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + pgoff_t end_index = i_size >> PAGE_SHIFT; unsigned offset; if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) @@ -109,9 +109,9 @@ static int gfs2_writepage_common(struct page *page, if (current->journal_info) goto redirty; /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); + offset = i_size & (PAGE_SIZE-1); if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); goto out; } return 1; @@ -238,7 +238,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); + unsigned nrblocks = nr_pages * (PAGE_SIZE/inode->i_sb->s_blocksize); int i; int ret; @@ -366,8 +366,8 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, cycled = 0; end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ @@ -458,7 +458,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) * so we need to supply one here. It doesn't happen often. */ if (unlikely(page->index)) { - zero_user(page, 0, PAGE_CACHE_SIZE); + zero_user(page, 0, PAGE_SIZE); SetPageUptodate(page); return 0; } @@ -471,7 +471,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) dsize = (dibh->b_size - sizeof(struct gfs2_dinode)); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); - memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); + memset(kaddr + dsize, 0, PAGE_SIZE - dsize); kunmap_atomic(kaddr); flush_dcache_page(page); brelse(dibh); @@ -560,8 +560,8 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, unsigned size) { struct address_space *mapping = ip->i_inode.i_mapping; - unsigned long index = *pos / PAGE_CACHE_SIZE; - unsigned offset = *pos & (PAGE_CACHE_SIZE - 1); + unsigned long index = *pos / PAGE_SIZE; + unsigned offset = *pos & (PAGE_SIZE - 1); unsigned copied = 0; unsigned amt; struct page *page; @@ -569,15 +569,15 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, do { amt = size - copied; - if (offset + size > PAGE_CACHE_SIZE) - amt = PAGE_CACHE_SIZE - offset; + if (offset + size > PAGE_SIZE) + amt = PAGE_SIZE - offset; page = read_cache_page(mapping, index, __gfs2_readpage, NULL); if (IS_ERR(page)) return PTR_ERR(page); p = kmap_atomic(page); memcpy(buf + copied, p + offset, amt); kunmap_atomic(p); - page_cache_release(page); + put_page(page); copied += amt; index++; offset = 0; @@ -651,8 +651,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, unsigned requested = 0; int alloc_required; int error = 0; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + pgoff_t index = pos >> PAGE_SHIFT; + unsigned from = pos & (PAGE_SIZE - 1); struct page *page; gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); @@ -697,7 +697,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, rblocks += gfs2_rg_blocks(ip, requested); error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + PAGE_SIZE/sdp->sd_sb.sb_bsize); if (error) goto out_trans_fail; @@ -727,7 +727,7 @@ out: return 0; unlock_page(page); - page_cache_release(page); + put_page(page); gfs2_trans_end(sdp); if (pos + len > ip->i_inode.i_size) @@ -827,7 +827,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, if (!PageUptodate(page)) SetPageUptodate(page); unlock_page(page); - page_cache_release(page); + put_page(page); if (copied) { if (inode->i_size < to) @@ -877,7 +877,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct buffer_head *dibh; - unsigned int from = pos & (PAGE_CACHE_SIZE - 1); + unsigned int from = pos & (PAGE_SIZE - 1); unsigned int to = from + len; int ret; struct gfs2_trans *tr = current->journal_info; @@ -888,7 +888,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, ret = gfs2_meta_inode_buffer(ip, &dibh); if (unlikely(ret)) { unlock_page(page); - page_cache_release(page); + put_page(page); goto failed; } @@ -977,7 +977,7 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) list_del_init(&bd->bd_list); else - gfs2_remove_from_journal(bh, current->journal_info, 0); + gfs2_remove_from_journal(bh, REMOVE_JDATA); } bh->b_bdev = NULL; clear_buffer_mapped(bh); @@ -992,7 +992,7 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset, { struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); unsigned int stop = offset + length; - int partial_page = (offset || length < PAGE_CACHE_SIZE); + int partial_page = (offset || length < PAGE_SIZE); struct buffer_head *bh, *head; unsigned long pos = 0; @@ -1042,13 +1042,13 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset) -static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); + loff_t offset = iocb->ki_pos; struct gfs2_holder gh; int rv; @@ -1063,7 +1063,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); rv = gfs2_glock_nq(&gh); if (rv) - return rv; + goto out_uninit; rv = gfs2_ok_for_dio(ip, offset); if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ @@ -1082,7 +1082,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * the first place, mapping->nr_pages will always be zero. */ if (mapping->nrpages) { - loff_t lstart = offset & ~(PAGE_CACHE_SIZE - 1); + loff_t lstart = offset & ~(PAGE_SIZE - 1); loff_t len = iov_iter_count(iter); loff_t end = PAGE_ALIGN(offset + len) - 1; @@ -1099,9 +1099,10 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - offset, gfs2_get_block_direct, NULL, NULL, 0); + gfs2_get_block_direct, NULL, NULL, 0); out: gfs2_glock_dq(&gh); +out_uninit: gfs2_holder_uninit(&gh); return rv; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 0860f0b5b3f1..24ce1cdd434a 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -75,7 +75,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, dsize = dibh->b_size - sizeof(struct gfs2_dinode); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); - memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); + memset(kaddr + dsize, 0, PAGE_SIZE - dsize); kunmap(page); SetPageUptodate(page); @@ -98,7 +98,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, if (release) { unlock_page(page); - page_cache_release(page); + put_page(page); } return 0; @@ -932,8 +932,8 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) { struct inode *inode = mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - unsigned long index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned long index = from >> PAGE_SHIFT; + unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize, iblock, length, pos; struct buffer_head *bh; struct page *page; @@ -945,7 +945,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -989,7 +989,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) mark_buffer_dirty(bh); unlock: unlock_page(page); - page_cache_release(page); + put_page(page); return err; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c9384f932975..e0f98e483aec 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -160,7 +160,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); error = gfs2_glock_nq(&gh); if (error) - return error; + goto out_uninit; fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) @@ -169,6 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) error = -EFAULT; gfs2_glock_dq(&gh); +out_uninit: gfs2_holder_uninit(&gh); return error; } @@ -354,8 +355,8 @@ static int gfs2_allocate_page_backing(struct page *page) { struct inode *inode = page->mapping->host; struct buffer_head bh; - unsigned long size = PAGE_CACHE_SIZE; - u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + unsigned long size = PAGE_SIZE; + u64 lblock = page->index << (PAGE_SHIFT - inode->i_blkbits); do { bh.b_state = 0; @@ -386,7 +387,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned long last_index; - u64 pos = page->index << PAGE_CACHE_SHIFT; + u64 pos = page->index << PAGE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; loff_t size; @@ -401,7 +402,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out; - gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); + gfs2_size_hint(vma->vm_file, pos, PAGE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -411,7 +412,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { + if (!gfs2_write_alloc_required(ip, pos, PAGE_SIZE)) { lock_page(page); if (!PageUptodate(page) || page->mapping != inode->i_mapping) { ret = -EAGAIN; @@ -424,7 +425,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; - gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; ret = gfs2_quota_lock_check(ip, &ap); if (ret) @@ -447,7 +448,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); ret = -EINVAL; size = i_size_read(inode); - last_index = (size - 1) >> PAGE_CACHE_SHIFT; + last_index = (size - 1) >> PAGE_SHIFT; /* Check page index against inode size */ if (size == 0 || (page->index > last_index)) goto out_trans_end; @@ -873,7 +874,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t rblocks += data_blocks ? data_blocks : 1; error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + PAGE_SIZE/sdp->sd_sb.sb_bsize); if (error) goto out_trans_fail; @@ -895,7 +896,10 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t mark_inode_dirty(inode); } - return generic_write_sync(file, pos, count); + if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) + return vfs_fsync_range(file, pos, pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); + return 0; out_trans_fail: gfs2_inplace_release(ip); @@ -950,6 +954,30 @@ out_uninit: return ret; } +static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct inode *inode = in->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + inode_lock(inode); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) { + inode_unlock(inode); + return ret; + } + + gfs2_glock_dq_uninit(&gh); + inode_unlock(inode); + + return generic_file_splice_read(in, ppos, pipe, len, flags); +} + + static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) @@ -1112,14 +1140,14 @@ const struct file_operations gfs2_file_fops = { .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, - .splice_read = generic_file_splice_read, + .splice_read = gfs2_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops = { - .iterate = gfs2_readdir, + .iterate_shared = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, .release = gfs2_release, @@ -1140,14 +1168,14 @@ const struct file_operations gfs2_file_fops_nolock = { .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, - .splice_read = generic_file_splice_read, + .splice_read = gfs2_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops_nolock = { - .iterate = gfs2_readdir, + .iterate_shared = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, .release = gfs2_release, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6539131c52a2..706fd9352f36 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -218,7 +218,7 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) * */ -static inline void do_error(struct gfs2_glock *gl, const int ret) +static void do_error(struct gfs2_glock *gl, const int ret) { struct gfs2_holder *gh, *tmp; @@ -475,7 +475,14 @@ __acquires(&gl->gl_lockref.lock) if (sdp->sd_lockstruct.ls_ops->lm_lock) { /* lock_dlm */ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); - if (ret) { + if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && + target == LM_ST_UNLOCKED && + test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) { + finish_xmote(gl, target); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + } + else if (ret) { pr_err("lm_lock ret %d\n", ret); GLOCK_BUG_ON(gl, 1); } @@ -1913,7 +1920,7 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file) if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti); + ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); } return ret; } @@ -1941,7 +1948,7 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file) if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti); + ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); } return ret; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 437fd73e381e..5db59d444838 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -286,17 +286,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) static int inode_go_demote_ok(const struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct gfs2_holder *gh; if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) return 0; - if (!list_empty(&gl->gl_holders)) { - gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); - if (gh->gh_list.next != &gl->gl_holders) - return 0; - } - return 1; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index bb30f9a72c65..21dc784f66c2 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -93,12 +93,12 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, int error; inode = iget_locked(sb, (unsigned long)no_addr); - ip = GFS2_I(inode); - ip->i_no_addr = no_addr; - if (!inode) return ERR_PTR(-ENOMEM); + ip = GFS2_I(inode); + ip->i_no_addr = no_addr; + if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); ip->i_no_formal_ino = no_formal_ino; @@ -692,12 +692,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, considered free. Any failures need to undo the gfs2 structures. */ if (default_acl) { - error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!error) - error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } @@ -1948,67 +1948,6 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int gfs2_setxattr(struct dentry *dentry, const char *name, - const void *data, size_t size, int flags) -{ - struct inode *inode = d_inode(dentry); - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (ret == 0) { - ret = gfs2_rsqa_alloc(ip); - if (ret == 0) - ret = generic_setxattr(dentry, name, data, size, flags); - gfs2_glock_dq(&gh); - } - gfs2_holder_uninit(&gh); - return ret; -} - -static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, - void *data, size_t size) -{ - struct inode *inode = d_inode(dentry); - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - /* For selinux during lookup */ - if (gfs2_glock_is_locked_by_me(ip->i_gl)) - return generic_getxattr(dentry, name, data, size); - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - ret = gfs2_glock_nq(&gh); - if (ret == 0) { - ret = generic_getxattr(dentry, name, data, size); - gfs2_glock_dq(&gh); - } - gfs2_holder_uninit(&gh); - return ret; -} - -static int gfs2_removexattr(struct dentry *dentry, const char *name) -{ - struct inode *inode = d_inode(dentry); - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (ret == 0) { - ret = gfs2_rsqa_alloc(ip); - if (ret == 0) - ret = generic_removexattr(dentry, name); - gfs2_glock_dq(&gh); - } - gfs2_holder_uninit(&gh); - return ret; -} - static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -2055,10 +1994,10 @@ const struct inode_operations gfs2_file_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, + .removexattr = generic_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, @@ -2077,10 +2016,10 @@ const struct inode_operations gfs2_dir_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, + .removexattr = generic_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, @@ -2093,10 +2032,10 @@ const struct inode_operations gfs2_symlink_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, + .removexattr = generic_removexattr, .fiemap = gfs2_fiemap, }; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index e137d96f1b17..8eaadabbc771 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -124,7 +124,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) if (mapping == NULL) mapping = &sdp->sd_aspace; - shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; + shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; index = blkno >> shift; /* convert block to page */ bufnum = blkno - (index << shift); /* block buf index within page */ @@ -154,7 +154,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) map_bh(bh, sdp->sd_vfs, blkno); unlock_page(page); - page_cache_release(page); + put_page(page); return bh; } @@ -325,18 +325,19 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) return 0; } -void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) +void gfs2_remove_from_journal(struct buffer_head *bh, int meta) { struct address_space *mapping = bh->b_page->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; + struct gfs2_trans *tr = current->journal_info; int was_pinned = 0; if (test_clear_buffer_pinned(bh)) { trace_gfs2_pin(bd, 0); atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_list); - if (meta) + if (meta == REMOVE_META) tr->tr_num_buf_rm++; else tr->tr_num_databuf_rm++; @@ -376,7 +377,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) if (bh) { lock_buffer(bh); gfs2_log_lock(sdp); - gfs2_remove_from_journal(bh, current->journal_info, 1); + gfs2_remove_from_journal(bh, REMOVE_META); gfs2_log_unlock(sdp); unlock_buffer(bh); brelse(bh); diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index c5086c8af5ed..ffdf6aa3509d 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -57,8 +57,12 @@ extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); -extern void gfs2_remove_from_journal(struct buffer_head *bh, - struct gfs2_trans *tr, int meta); +enum { + REMOVE_JDATA = 0, + REMOVE_META = 1, +}; + +extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta); extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, struct buffer_head **bhp); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 49b0bff18fe3..45463600fb81 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -824,7 +824,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) * i_mutex on quota files is special. Since this inode is hidden system * file, we are safe to define locking ourselves. */ - lockdep_set_class(&sdp->sd_quota_inode->i_mutex, + lockdep_set_class(&sdp->sd_quota_inode->i_rwsem, &gfs2_quota_imutex_key); error = gfs2_rindex_update(sdp); @@ -1360,7 +1360,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, return ERR_PTR(error); } s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, - d_inode(path.dentry)->i_sb->s_bdev); + path.dentry->d_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { pr_warn("gfs2 mount does not exist\n"); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a39891344259..ce7d69a2fdc0 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -701,7 +701,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, unsigned to_write = bytes, pg_off = off; int done = 0; - blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift); + blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift); boff = off % bsize; page = find_or_create_page(mapping, index, GFP_NOFS); @@ -753,13 +753,13 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, flush_dcache_page(page); kunmap_atomic(kaddr); unlock_page(page); - page_cache_release(page); + put_page(page); return 0; unlock_out: unlock_page(page); - page_cache_release(page); + put_page(page); return -EIO; } @@ -773,13 +773,13 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, nbytes = sizeof(struct gfs2_quota); - pg_beg = loc >> PAGE_CACHE_SHIFT; - pg_off = loc % PAGE_CACHE_SIZE; + pg_beg = loc >> PAGE_SHIFT; + pg_off = loc % PAGE_SIZE; /* If the quota straddles a page boundary, split the write in two */ - if ((pg_off + nbytes) > PAGE_CACHE_SIZE) { + if ((pg_off + nbytes) > PAGE_SIZE) { pg_oflow = 1; - overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE; + overflow = (pg_off + nbytes) - PAGE_SIZE; } ptr = qp; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 07c0265aa195..5bd216901e89 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -73,8 +73,7 @@ static const char valid_change[16] = { }; static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, - const struct gfs2_inode *ip, bool nowrap, - const struct gfs2_alloc_parms *ap); + const struct gfs2_inode *ip, bool nowrap); /** @@ -918,9 +917,8 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; - rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_CACHE_MASK; - rgd->rd_gl->gl_vm.end = PAGE_CACHE_ALIGN((rgd->rd_addr + - rgd->rd_length) * bsize) - 1; + rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; + rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -1512,7 +1510,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) return; - ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap); + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true); if (ret == 0) { rs->rs_rbm = rbm; rs->rs_free = extlen; @@ -1639,7 +1637,6 @@ fail: * @ip: If set, check for reservations * @nowrap: Stop looking at the end of the rgrp, rather than wrapping * around until we've reached the starting point. - * @ap: the allocation parameters * * Side effects: * - If looking for free blocks, we set GBF_FULL on each bitmap which @@ -1651,8 +1648,7 @@ fail: */ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, - const struct gfs2_inode *ip, bool nowrap, - const struct gfs2_alloc_parms *ap) + const struct gfs2_inode *ip, bool nowrap) { struct buffer_head *bh; int initial_bii; @@ -1773,7 +1769,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip while (1) { down_write(&sdp->sd_log_flush_lock); error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, - true, NULL); + true); up_write(&sdp->sd_log_flush_lock); if (error == -ENOSPC) break; @@ -2330,12 +2326,11 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, int error; gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false); if (error == -ENOSPC) { gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false, - NULL); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false); } /* Since all blocks are reserved in advance, this shouldn't happen */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index f8a0cd821290..9b2ff353e45f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1176,7 +1176,7 @@ static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *s static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct super_block *sb = d_inode(dentry)->i_sb; + struct super_block *sb = dentry->d_sb; struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_statfs_change_host sc; int error; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index cf645835710f..aee4485ad8a9 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -68,6 +68,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) fs_err(sdp, "telling LM to unmount\n"); lm->lm_unmount(sdp); } + set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); fs_err(sdp, "withdrawn\n"); dump_stack(); } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index e8dfb4740c04..f42ab53bd30d 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -583,13 +583,11 @@ out: * * Returns: actual size of data on success, -errno on error */ -static int gfs2_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) +static int __gfs2_xattr_get(struct inode *inode, const char *name, + void *buffer, size_t size, int type) { - struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; - int type = handler->flags; int error; if (!ip->i_eattr) @@ -611,6 +609,29 @@ static int gfs2_xattr_get(const struct xattr_handler *handler, return error; } +static int gfs2_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + bool need_unlock = false; + int ret; + + /* During lookup, SELinux calls this function with the glock locked. */ + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (ret) + return ret; + need_unlock = true; + } + ret = __gfs2_xattr_get(inode, name, buffer, size, handler->flags); + if (need_unlock) + gfs2_glock_dq_uninit(&gh); + return ret; +} + /** * ea_alloc_blk - allocates a new block for extended attributes. * @ip: A pointer to the inode that's getting extended attributes @@ -1233,8 +1254,21 @@ static int gfs2_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - return __gfs2_xattr_set(d_inode(dentry), name, value, - size, flags, handler->flags); + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + ret = gfs2_rsqa_alloc(ip); + if (ret) + return ret; + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + return ret; + ret = __gfs2_xattr_set(inode, name, value, size, flags, handler->flags); + gfs2_glock_dq_uninit(&gh); + return ret; } static int ea_dealloc_indirect(struct gfs2_inode *ip) diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 8d931b157bbe..064f92f17efc 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -56,10 +56,9 @@ out: return res; } -ssize_t hfs_getxattr(struct dentry *dentry, const char *name, - void *value, size_t size) +ssize_t hfs_getxattr(struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) { - struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 221719eac5de..d77d844b668b 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -278,14 +278,14 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) mapping = tree->inode->i_mapping; off = (loff_t)cnid * tree->node_size; - block = off >> PAGE_CACHE_SHIFT; - node->page_offset = off & ~PAGE_CACHE_MASK; + block = off >> PAGE_SHIFT; + node->page_offset = off & ~PAGE_MASK; for (i = 0; i < tree->pages_per_bnode; i++) { page = read_mapping_page(mapping, block++, NULL); if (IS_ERR(page)) goto fail; if (PageError(page)) { - page_cache_release(page); + put_page(page); goto fail; } node->page[i] = page; @@ -401,7 +401,7 @@ void hfs_bnode_free(struct hfs_bnode *node) for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) - page_cache_release(node->page[i]); + put_page(node->page[i]); kfree(node); } @@ -429,11 +429,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) pagep = node->page; memset(kmap(*pagep) + node->page_offset, 0, - min((int)PAGE_CACHE_SIZE, (int)tree->node_size)); + min((int)PAGE_SIZE, (int)tree->node_size)); set_page_dirty(*pagep); kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE); + memset(kmap(*++pagep), 0, PAGE_SIZE); set_page_dirty(*pagep); kunmap(*pagep); } diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 1ab19e660e69..37cdd955eceb 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -116,14 +116,14 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } tree->node_size_shift = ffs(size) - 1; - tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; kunmap(page); - page_cache_release(page); + put_page(page); return tree; fail_page: - page_cache_release(page); + put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; iput(tree->inode); @@ -257,9 +257,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; - pagep = node->page + (off >> PAGE_CACHE_SHIFT); + pagep = node->page + (off >> PAGE_SHIFT); data = kmap(*pagep); - off &= ~PAGE_CACHE_MASK; + off &= ~PAGE_MASK; idx = 0; for (;;) { @@ -279,7 +279,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } } - if (++off >= PAGE_CACHE_SIZE) { + if (++off >= PAGE_SIZE) { kunmap(*pagep); data = kmap(*++pagep); off = 0; @@ -302,9 +302,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) len = hfs_brec_lenoff(node, 0, &off16); off = off16; off += node->page_offset; - pagep = node->page + (off >> PAGE_CACHE_SHIFT); + pagep = node->page + (off >> PAGE_SHIFT); data = kmap(*pagep); - off &= ~PAGE_CACHE_MASK; + off &= ~PAGE_MASK; } } @@ -348,9 +348,9 @@ void hfs_bmap_free(struct hfs_bnode *node) len = hfs_brec_lenoff(node, 0, &off); } off += node->page_offset + nidx / 8; - page = node->page[off >> PAGE_CACHE_SHIFT]; + page = node->page[off >> PAGE_SHIFT]; data = kmap(page); - off &= ~PAGE_CACHE_MASK; + off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 1eb5d415d434..98cde8ba5dc2 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -240,10 +240,13 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) } } + /* we only need to take spinlock for exclusion with ->release() */ + spin_lock(&HFS_I(dir)->open_dir_lock); list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) { if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) rd->file->f_pos--; } + spin_unlock(&HFS_I(dir)->open_dir_lock); res = hfs_brec_remove(&fd); if (res) diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index e9f2b855f831..163190ecc0d2 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -161,8 +161,14 @@ static int hfs_readdir(struct file *file, struct dir_context *ctx) } file->private_data = rd; rd->file = file; + spin_lock(&HFS_I(inode)->open_dir_lock); list_add(&rd->list, &HFS_I(inode)->open_dir_list); + spin_unlock(&HFS_I(inode)->open_dir_lock); } + /* + * Can be done after the list insertion; exclusion with + * hfs_delete_cat() is provided by directory lock. + */ memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); out: hfs_find_exit(&fd); @@ -173,9 +179,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { - inode_lock(inode); + spin_lock(&HFS_I(inode)->open_dir_lock); list_del(&rd->list); - inode_unlock(inode); + spin_unlock(&HFS_I(inode)->open_dir_lock); kfree(rd); } return 0; @@ -303,7 +309,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, const struct file_operations hfs_dir_operations = { .read = generic_read_dir, - .iterate = hfs_readdir, + .iterate_shared = hfs_readdir, .llseek = generic_file_llseek, .release = hfs_dir_release, }; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 1f1c7dcbcc2f..fa3eed86837c 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -69,6 +69,7 @@ struct hfs_inode_info { struct hfs_cat_key cat_key; struct list_head open_dir_list; + spinlock_t open_dir_lock; struct inode *rsrc_inode; struct mutex extents_lock; @@ -213,8 +214,8 @@ extern void hfs_delete_inode(struct inode *); /* attr.c */ extern int hfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -extern ssize_t hfs_getxattr(struct dentry *dentry, const char *name, - void *value, size_t size); +extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size); extern ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* mdb.c */ diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 6686bf39a5b5..8eed66af5b82 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -91,8 +91,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask) if (!tree) return 0; - if (tree->node_size >= PAGE_CACHE_SIZE) { - nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); + if (tree->node_size >= PAGE_SIZE) { + nidx = page->index >> (tree->node_size_shift - PAGE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); if (!node) @@ -105,8 +105,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask) } spin_unlock(&tree->hash_lock); } else { - nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); - i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); + nidx = page->index << (PAGE_SHIFT - tree->node_size_shift); + i = 1 << (PAGE_SHIFT - tree->node_size_shift); spin_lock(&tree->hash_lock); do { node = hfs_bnode_findhash(tree, nidx++); @@ -124,8 +124,7 @@ static int hfs_releasepage(struct page *page, gfp_t mask) return res ? try_to_free_buffers(page) : 0; } -static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -133,7 +132,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, hfs_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, hfs_get_block); /* * In case of error extending write may have instantiated a few @@ -141,7 +140,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + count; + loff_t end = iocb->ki_pos + count; if (end > isize) hfs_write_failed(mapping, end); @@ -187,6 +186,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode) mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); + spin_lock_init(&HFS_I(inode)->open_dir_lock); hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); inode->i_ino = HFS_SB(sb)->next_id++; inode->i_mode = mode; @@ -318,6 +318,7 @@ static int hfs_read_inode(struct inode *inode, void *data) HFS_I(inode)->rsrc_inode = NULL; mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); + spin_lock_init(&HFS_I(inode)->open_dir_lock); /* Initialize the inode */ inode->i_uid = hsb->s_uid; diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index d2954451519e..c0ae274c0a22 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -13,7 +13,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" -#define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8) +#define PAGE_CACHE_BITS (PAGE_SIZE * 8) int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 63924662aaf3..ce014ceb89ef 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -24,16 +24,16 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) int l; off += node->page_offset; - pagep = node->page + (off >> PAGE_CACHE_SHIFT); - off &= ~PAGE_CACHE_MASK; + pagep = node->page + (off >> PAGE_SHIFT); + off &= ~PAGE_MASK; - l = min_t(int, len, PAGE_CACHE_SIZE - off); + l = min_t(int, len, PAGE_SIZE - off); memcpy(buf, kmap(*pagep) + off, l); kunmap(*pagep); while ((len -= l) != 0) { buf += l; - l = min_t(int, len, PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_SIZE); memcpy(buf, kmap(*++pagep), l); kunmap(*pagep); } @@ -77,17 +77,17 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) int l; off += node->page_offset; - pagep = node->page + (off >> PAGE_CACHE_SHIFT); - off &= ~PAGE_CACHE_MASK; + pagep = node->page + (off >> PAGE_SHIFT); + off &= ~PAGE_MASK; - l = min_t(int, len, PAGE_CACHE_SIZE - off); + l = min_t(int, len, PAGE_SIZE - off); memcpy(kmap(*pagep) + off, buf, l); set_page_dirty(*pagep); kunmap(*pagep); while ((len -= l) != 0) { buf += l; - l = min_t(int, len, PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_SIZE); memcpy(kmap(*++pagep), buf, l); set_page_dirty(*pagep); kunmap(*pagep); @@ -107,16 +107,16 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) int l; off += node->page_offset; - pagep = node->page + (off >> PAGE_CACHE_SHIFT); - off &= ~PAGE_CACHE_MASK; + pagep = node->page + (off >> PAGE_SHIFT); + off &= ~PAGE_MASK; - l = min_t(int, len, PAGE_CACHE_SIZE - off); + l = min_t(int, len, PAGE_SIZE - off); memset(kmap(*pagep) + off, 0, l); set_page_dirty(*pagep); kunmap(*pagep); while ((len -= l) != 0) { - l = min_t(int, len, PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_SIZE); memset(kmap(*++pagep), 0, l); set_page_dirty(*pagep); kunmap(*pagep); @@ -136,20 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; - src_page = src_node->page + (src >> PAGE_CACHE_SHIFT); - src &= ~PAGE_CACHE_MASK; - dst_page = dst_node->page + (dst >> PAGE_CACHE_SHIFT); - dst &= ~PAGE_CACHE_MASK; + src_page = src_node->page + (src >> PAGE_SHIFT); + src &= ~PAGE_MASK; + dst_page = dst_node->page + (dst >> PAGE_SHIFT); + dst &= ~PAGE_MASK; if (src == dst) { - l = min_t(int, len, PAGE_CACHE_SIZE - src); + l = min_t(int, len, PAGE_SIZE - src); memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); kunmap(*src_page); set_page_dirty(*dst_page); kunmap(*dst_page); while ((len -= l) != 0) { - l = min_t(int, len, PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_SIZE); memcpy(kmap(*++dst_page), kmap(*++src_page), l); kunmap(*src_page); set_page_dirty(*dst_page); @@ -161,12 +161,12 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, do { src_ptr = kmap(*src_page) + src; dst_ptr = kmap(*dst_page) + dst; - if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) { - l = PAGE_CACHE_SIZE - src; + if (PAGE_SIZE - src < PAGE_SIZE - dst) { + l = PAGE_SIZE - src; src = 0; dst += l; } else { - l = PAGE_CACHE_SIZE - dst; + l = PAGE_SIZE - dst; src += l; dst = 0; } @@ -195,11 +195,11 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) dst += node->page_offset; if (dst > src) { src += len - 1; - src_page = node->page + (src >> PAGE_CACHE_SHIFT); - src = (src & ~PAGE_CACHE_MASK) + 1; + src_page = node->page + (src >> PAGE_SHIFT); + src = (src & ~PAGE_MASK) + 1; dst += len - 1; - dst_page = node->page + (dst >> PAGE_CACHE_SHIFT); - dst = (dst & ~PAGE_CACHE_MASK) + 1; + dst_page = node->page + (dst >> PAGE_SHIFT); + dst = (dst & ~PAGE_MASK) + 1; if (src == dst) { while (src < len) { @@ -208,7 +208,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) set_page_dirty(*dst_page); kunmap(*dst_page); len -= src; - src = PAGE_CACHE_SIZE; + src = PAGE_SIZE; src_page--; dst_page--; } @@ -226,32 +226,32 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) dst_ptr = kmap(*dst_page) + dst; if (src < dst) { l = src; - src = PAGE_CACHE_SIZE; + src = PAGE_SIZE; dst -= l; } else { l = dst; src -= l; - dst = PAGE_CACHE_SIZE; + dst = PAGE_SIZE; } l = min(len, l); memmove(dst_ptr - l, src_ptr - l, l); kunmap(*src_page); set_page_dirty(*dst_page); kunmap(*dst_page); - if (dst == PAGE_CACHE_SIZE) + if (dst == PAGE_SIZE) dst_page--; else src_page--; } while ((len -= l)); } } else { - src_page = node->page + (src >> PAGE_CACHE_SHIFT); - src &= ~PAGE_CACHE_MASK; - dst_page = node->page + (dst >> PAGE_CACHE_SHIFT); - dst &= ~PAGE_CACHE_MASK; + src_page = node->page + (src >> PAGE_SHIFT); + src &= ~PAGE_MASK; + dst_page = node->page + (dst >> PAGE_SHIFT); + dst &= ~PAGE_MASK; if (src == dst) { - l = min_t(int, len, PAGE_CACHE_SIZE - src); + l = min_t(int, len, PAGE_SIZE - src); memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l); kunmap(*src_page); @@ -259,7 +259,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) kunmap(*dst_page); while ((len -= l) != 0) { - l = min_t(int, len, PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_SIZE); memmove(kmap(*++dst_page), kmap(*++src_page), l); kunmap(*src_page); @@ -272,13 +272,13 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) do { src_ptr = kmap(*src_page) + src; dst_ptr = kmap(*dst_page) + dst; - if (PAGE_CACHE_SIZE - src < - PAGE_CACHE_SIZE - dst) { - l = PAGE_CACHE_SIZE - src; + if (PAGE_SIZE - src < + PAGE_SIZE - dst) { + l = PAGE_SIZE - src; src = 0; dst += l; } else { - l = PAGE_CACHE_SIZE - dst; + l = PAGE_SIZE - dst; src += l; dst = 0; } @@ -444,14 +444,14 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) mapping = tree->inode->i_mapping; off = (loff_t)cnid << tree->node_size_shift; - block = off >> PAGE_CACHE_SHIFT; - node->page_offset = off & ~PAGE_CACHE_MASK; + block = off >> PAGE_SHIFT; + node->page_offset = off & ~PAGE_MASK; for (i = 0; i < tree->pages_per_bnode; block++, i++) { page = read_mapping_page(mapping, block, NULL); if (IS_ERR(page)) goto fail; if (PageError(page)) { - page_cache_release(page); + put_page(page); goto fail; } node->page[i] = page; @@ -569,7 +569,7 @@ void hfs_bnode_free(struct hfs_bnode *node) for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) - page_cache_release(node->page[i]); + put_page(node->page[i]); kfree(node); } @@ -597,11 +597,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) pagep = node->page; memset(kmap(*pagep) + node->page_offset, 0, - min_t(int, PAGE_CACHE_SIZE, tree->node_size)); + min_t(int, PAGE_SIZE, tree->node_size)); set_page_dirty(*pagep); kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE); + memset(kmap(*++pagep), 0, PAGE_SIZE); set_page_dirty(*pagep); kunmap(*pagep); } diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 3345c7553edc..d9d1a36ba826 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -236,15 +236,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = - (tree->node_size + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + (tree->node_size + PAGE_SIZE - 1) >> + PAGE_SHIFT; kunmap(page); - page_cache_release(page); + put_page(page); return tree; fail_page: - page_cache_release(page); + put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; iput(tree->inode); @@ -380,9 +380,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; - pagep = node->page + (off >> PAGE_CACHE_SHIFT); + pagep = node->page + (off >> PAGE_SHIFT); data = kmap(*pagep); - off &= ~PAGE_CACHE_MASK; + off &= ~PAGE_MASK; idx = 0; for (;;) { @@ -403,7 +403,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } } - if (++off >= PAGE_CACHE_SIZE) { + if (++off >= PAGE_SIZE) { kunmap(*pagep); data = kmap(*++pagep); off = 0; @@ -426,9 +426,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) len = hfs_brec_lenoff(node, 0, &off16); off = off16; off += node->page_offset; - pagep = node->page + (off >> PAGE_CACHE_SHIFT); + pagep = node->page + (off >> PAGE_SHIFT); data = kmap(*pagep); - off &= ~PAGE_CACHE_MASK; + off &= ~PAGE_MASK; } } @@ -475,9 +475,9 @@ void hfs_bmap_free(struct hfs_bnode *node) len = hfs_brec_lenoff(node, 0, &off); } off += node->page_offset + nidx / 8; - page = node->page[off >> PAGE_CACHE_SHIFT]; + page = node->page[off >> PAGE_SHIFT]; data = kmap(page); - off &= ~PAGE_CACHE_MASK; + off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 022974ab6e3c..fb707e8f423a 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -374,12 +374,15 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); } + /* we only need to take spinlock for exclusion with ->release() */ + spin_lock(&HFSPLUS_I(dir)->open_dir_lock); list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) { struct hfsplus_readdir_data *rd = list_entry(pos, struct hfsplus_readdir_data, list); if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) rd->file->f_pos--; } + spin_unlock(&HFSPLUS_I(dir)->open_dir_lock); err = hfs_brec_remove(&fd); if (err) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index a4e867e08947..42e128661dc1 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -271,8 +271,14 @@ next: } file->private_data = rd; rd->file = file; + spin_lock(&HFSPLUS_I(inode)->open_dir_lock); list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); + spin_unlock(&HFSPLUS_I(inode)->open_dir_lock); } + /* + * Can be done after the list insertion; exclusion with + * hfsplus_delete_cat() is provided by directory lock. + */ memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); out: kfree(strbuf); @@ -284,9 +290,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { - inode_lock(inode); + spin_lock(&HFSPLUS_I(inode)->open_dir_lock); list_del(&rd->list); - inode_unlock(inode); + spin_unlock(&HFSPLUS_I(inode)->open_dir_lock); kfree(rd); } return 0; @@ -569,7 +575,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { const struct file_operations hfsplus_dir_operations = { .fsync = hfsplus_file_fsync, .read = generic_read_dir, - .iterate = hfsplus_readdir, + .iterate_shared = hfsplus_readdir, .unlocked_ioctl = hfsplus_ioctl, .llseek = generic_file_llseek, .release = hfsplus_dir_release, diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index f91a1faf819e..fdc3446d934a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -244,6 +244,7 @@ struct hfsplus_inode_info { u8 userflags; /* BSD user file flags */ u32 subfolders; /* Subfolder count (HFSX only) */ struct list_head open_dir_list; + spinlock_t open_dir_lock; loff_t phys_size; struct inode vfs_inode; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 1a6394cdb54e..ef9fefe364a6 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -87,9 +87,9 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) } if (!tree) return 0; - if (tree->node_size >= PAGE_CACHE_SIZE) { + if (tree->node_size >= PAGE_SIZE) { nidx = page->index >> - (tree->node_size_shift - PAGE_CACHE_SHIFT); + (tree->node_size_shift - PAGE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); if (!node) @@ -103,8 +103,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) spin_unlock(&tree->hash_lock); } else { nidx = page->index << - (PAGE_CACHE_SHIFT - tree->node_size_shift); - i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); + (PAGE_SHIFT - tree->node_size_shift); + i = 1 << (PAGE_SHIFT - tree->node_size_shift); spin_lock(&tree->hash_lock); do { node = hfs_bnode_findhash(tree, nidx++); @@ -122,8 +122,7 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) return res ? try_to_free_buffers(page) : 0; } -static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -131,7 +130,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, hfsplus_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, hfsplus_get_block); /* * In case of error extending write may have instantiated a few @@ -139,7 +138,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + count; + loff_t end = iocb->ki_pos + count; if (end > isize) hfsplus_write_failed(mapping, end); @@ -374,6 +373,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) hip = HFSPLUS_I(inode); INIT_LIST_HEAD(&hip->open_dir_list); + spin_lock_init(&hip->open_dir_lock); mutex_init(&hip->extents_lock); atomic_set(&hip->opencnt, 0); hip->extent_state = 0; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index afb33eda6d7d..ab7ea2506b4d 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -48,9 +48,6 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - return acl; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 5d54490a136d..755bf30ba1ce 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -67,6 +67,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) return inode; INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); + spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock); mutex_init(&HFSPLUS_I(inode)->extents_lock); HFSPLUS_I(inode)->flags = 0; HFSPLUS_I(inode)->extent_state = 0; @@ -438,7 +439,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = -EFBIG; last_fs_block = sbi->total_blocks - 1; last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index ab01530b4930..4f118d282a7a 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -220,7 +220,7 @@ check_attr_tree_state_again: index = 0; written = 0; - for (; written < node_size; index++, written += PAGE_CACHE_SIZE) { + for (; written < node_size; index++, written += PAGE_SIZE) { void *kaddr; page = read_mapping_page(mapping, index, NULL); @@ -231,11 +231,11 @@ check_attr_tree_state_again: kaddr = kmap_atomic(page); memcpy(kaddr, buf + written, - min_t(size_t, PAGE_CACHE_SIZE, node_size - written)); + min_t(size_t, PAGE_SIZE, node_size - written)); kunmap_atomic(kaddr); set_page_dirty(page); - page_cache_release(page); + put_page(page); } hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY); @@ -579,7 +579,7 @@ failed_getxattr_init: return res; } -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +ssize_t hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size, const char *prefix, size_t prefixlen) { @@ -594,7 +594,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_getxattr(d_inode(dentry), xattr_name, value, size); + res = __hfsplus_getxattr(inode, xattr_name, value, size); kfree(xattr_name); return res; @@ -844,8 +844,8 @@ end_removexattr: } static int hfsplus_osx_getxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { /* * Don't allow retrieving properly prefixed attributes @@ -860,7 +860,7 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ - return __hfsplus_getxattr(d_inode(dentry), name, buffer, size); + return __hfsplus_getxattr(inode, name, buffer, size); } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index f9b0955b3d28..d04ba6f58df2 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -28,7 +28,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size); -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +ssize_t hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size, const char *prefix, size_t prefixlen); diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index 72a68a3a0c99..ae2ca8c2e335 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -14,10 +14,10 @@ #include "acl.h" static int hfsplus_security_getxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return hfsplus_getxattr(dentry, name, buffer, size, + return hfsplus_getxattr(inode, name, buffer, size, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index 95a7704c7abb..eae2947060aa 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -12,10 +12,10 @@ #include "xattr.h" static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return hfsplus_getxattr(dentry, name, buffer, size, + return hfsplus_getxattr(inode, name, buffer, size, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 6fc269baf959..3c9eec3e4c7b 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -12,11 +12,11 @@ #include "xattr.h" static int hfsplus_user_getxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return hfsplus_getxattr(dentry, name, buffer, size, + return hfsplus_getxattr(inode, name, buffer, size, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index d1abbee281d1..5c57654927a6 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -398,7 +398,7 @@ static const struct file_operations hostfs_file_fops = { static const struct file_operations hostfs_dir_fops = { .llseek = generic_file_llseek, - .iterate = hostfs_readdir, + .iterate_shared = hostfs_readdir, .read = generic_read_dir, .open = hostfs_open, .fsync = hostfs_fsync, @@ -410,12 +410,12 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = mapping->host; char *buffer; loff_t base = page_offset(page); - int count = PAGE_CACHE_SIZE; - int end_index = inode->i_size >> PAGE_CACHE_SHIFT; + int count = PAGE_SIZE; + int end_index = inode->i_size >> PAGE_SHIFT; int err; if (page->index >= end_index) - count = inode->i_size & (PAGE_CACHE_SIZE-1); + count = inode->i_size & (PAGE_SIZE-1); buffer = kmap(page); @@ -447,7 +447,7 @@ static int hostfs_readpage(struct file *file, struct page *page) buffer = kmap(page); bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (bytes_read < 0) { ClearPageUptodate(page); SetPageError(page); @@ -455,7 +455,7 @@ static int hostfs_readpage(struct file *file, struct page *page) goto out; } - memset(buffer + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read); + memset(buffer + bytes_read, 0, PAGE_SIZE - bytes_read); ClearPageError(page); SetPageUptodate(page); @@ -471,7 +471,7 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; *pagep = grab_cache_page_write_begin(mapping, index, flags); if (!*pagep) @@ -485,14 +485,14 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; void *buffer; - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned from = pos & (PAGE_SIZE - 1); int err; buffer = kmap(page); err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied); kunmap(page); - if (!PageUptodate(page) && err == PAGE_CACHE_SIZE) + if (!PageUptodate(page) && err == PAGE_SIZE) SetPageUptodate(page); /* @@ -502,7 +502,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, if (err > 0 && (pos > inode->i_size)) inode->i_size = pos; unlock_page(page); - page_cache_release(page); + put_page(page); return err; } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index e57a53c13d86..7b9150c2e75c 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -44,7 +44,11 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) else goto fail; if (pos == 12) goto fail; } - hpfs_add_pos(i, &filp->f_pos); + if (unlikely(hpfs_add_pos(i, &filp->f_pos) < 0)) { + hpfs_unlock(s); + inode_unlock(i); + return -ENOMEM; + } ok: filp->f_pos = new_off; hpfs_unlock(s); @@ -141,8 +145,10 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx) ctx->pos = 1; } if (ctx->pos == 1) { + ret = hpfs_add_pos(inode, &file->f_pos); + if (unlikely(ret < 0)) + goto out; ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; - hpfs_add_pos(inode, &file->f_pos); file->f_version = inode->i_version; } next_pos = ctx->pos; @@ -324,7 +330,7 @@ const struct file_operations hpfs_dir_ops = { .llseek = hpfs_dir_lseek, .read = generic_read_dir, - .iterate = hpfs_readdir, + .iterate_shared = hpfs_readdir, .release = hpfs_dir_release, .fsync = hpfs_file_fsync, .unlocked_ioctl = hpfs_ioctl, diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 2923a7bd82ac..86ab7e790b4e 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -21,7 +21,7 @@ static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde) return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1; } -void hpfs_add_pos(struct inode *inode, loff_t *pos) +int hpfs_add_pos(struct inode *inode, loff_t *pos) { struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); int i = 0; @@ -29,11 +29,12 @@ void hpfs_add_pos(struct inode *inode, loff_t *pos) if (hpfs_inode->i_rddir_off) for (; hpfs_inode->i_rddir_off[i]; i++) - if (hpfs_inode->i_rddir_off[i] == pos) return; + if (hpfs_inode->i_rddir_off[i] == pos) + return 0; if (!(i&0x0f)) { if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) { pr_err("out of memory for position list\n"); - return; + return -ENOMEM; } if (hpfs_inode->i_rddir_off) { memcpy(ppos, hpfs_inode->i_rddir_off, i * sizeof(loff_t)); @@ -43,6 +44,7 @@ void hpfs_add_pos(struct inode *inode, loff_t *pos) } hpfs_inode->i_rddir_off[i] = pos; hpfs_inode->i_rddir_off[i + 1] = NULL; + return 0; } void hpfs_del_pos(struct inode *inode, loff_t *pos) diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 975654a63c13..aebb78f9e47f 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -242,7 +242,7 @@ extern const struct file_operations hpfs_dir_ops; /* dnode.c */ -void hpfs_add_pos(struct inode *, loff_t *); +int hpfs_add_pos(struct inode *, loff_t *); void hpfs_del_pos(struct inode *, loff_t *); struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, const unsigned char *, unsigned, secno); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e1f465a389d5..4ea71eba40a5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -213,12 +213,12 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, int i, chunksize; /* Find which 4k chunk and offset with in that chunk */ - i = offset >> PAGE_CACHE_SHIFT; - offset = offset & ~PAGE_CACHE_MASK; + i = offset >> PAGE_SHIFT; + offset = offset & ~PAGE_MASK; while (size) { size_t n; - chunksize = PAGE_CACHE_SIZE; + chunksize = PAGE_SIZE; if (offset) chunksize -= offset; if (chunksize > size) @@ -237,7 +237,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, /* * Support for read() - Find the page attached to f_mapping and copy out the * data. Its *very* similar to do_generic_mapping_read(), we can't use that - * since it has PAGE_CACHE_SIZE assumptions. + * since it has PAGE_SIZE assumptions. */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -285,7 +285,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) * We have the page, copy it to user space buffer. */ copied = hugetlbfs_read_actor(page, offset, to, nr); - page_cache_release(page); + put_page(page); } offset += copied; retval += copied; diff --git a/fs/inode.c b/fs/inode.c index 69b8b526c194..4ccbc21b30ce 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -151,6 +151,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_link = NULL; + inode->i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; @@ -165,8 +166,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); - mutex_init(&inode->i_mutex); - lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); + init_rwsem(&inode->i_rwsem); + lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key); atomic_set(&inode->i_dio_count, 0); @@ -238,9 +239,9 @@ void __destroy_inode(struct inode *inode) } #ifdef CONFIG_FS_POSIX_ACL - if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) + if (inode->i_acl && !is_uncached_acl(inode->i_acl)) posix_acl_release(inode->i_acl); - if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) + if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) posix_acl_release(inode->i_default_acl); #endif this_cpu_dec(nr_inodes); @@ -924,13 +925,13 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode) struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ - if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) { + if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_mutex */ - mutex_destroy(&inode->i_mutex); - mutex_init(&inode->i_mutex); - lockdep_set_class(&inode->i_mutex, + // mutex_destroy(&inode->i_mutex); + init_rwsem(&inode->i_rwsem); + lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); } } diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index f311bf084015..2e4e834d1a98 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -26,7 +26,7 @@ #include "zisofs.h" /* This should probably be global. */ -static char zisofs_sink_page[PAGE_CACHE_SIZE]; +static char zisofs_sink_page[PAGE_SIZE]; /* * This contains the zlib memory allocation and the mutex for the @@ -70,11 +70,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, for ( i = 0 ; i < pcount ; i++ ) { if (!pages[i]) continue; - memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE); + memset(page_address(pages[i]), 0, PAGE_SIZE); flush_dcache_page(pages[i]); SetPageUptodate(pages[i]); } - return ((loff_t)pcount) << PAGE_CACHE_SHIFT; + return ((loff_t)pcount) << PAGE_SHIFT; } /* Because zlib is not thread-safe, do all the I/O at the top. */ @@ -121,11 +121,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, if (pages[curpage]) { stream.next_out = page_address(pages[curpage]) + poffset; - stream.avail_out = PAGE_CACHE_SIZE - poffset; + stream.avail_out = PAGE_SIZE - poffset; poffset = 0; } else { stream.next_out = (void *)&zisofs_sink_page; - stream.avail_out = PAGE_CACHE_SIZE; + stream.avail_out = PAGE_SIZE; } } if (!stream.avail_in) { @@ -220,14 +220,14 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, * pages with the data we have anyway... */ start_off = page_offset(pages[full_page]); - end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size); + end_off = min_t(loff_t, start_off + PAGE_SIZE, inode->i_size); cstart_block = start_off >> zisofs_block_shift; cend_block = (end_off + (1 << zisofs_block_shift) - 1) >> zisofs_block_shift; - WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) != - ((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK)); + WARN_ON(start_off - (full_page << PAGE_SHIFT) != + ((cstart_block << zisofs_block_shift) & PAGE_MASK)); /* Find the pointer to this specific chunk */ /* Note: we're not using isonum_731() here because the data is known aligned */ @@ -260,10 +260,10 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, ret = zisofs_uncompress_block(inode, block_start, block_end, pcount, pages, poffset, &err); poffset += ret; - pages += poffset >> PAGE_CACHE_SHIFT; - pcount -= poffset >> PAGE_CACHE_SHIFT; - full_page -= poffset >> PAGE_CACHE_SHIFT; - poffset &= ~PAGE_CACHE_MASK; + pages += poffset >> PAGE_SHIFT; + pcount -= poffset >> PAGE_SHIFT; + full_page -= poffset >> PAGE_SHIFT; + poffset &= ~PAGE_MASK; if (err) { brelse(bh); @@ -282,7 +282,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, if (poffset && *pages) { memset(page_address(*pages) + poffset, 0, - PAGE_CACHE_SIZE - poffset); + PAGE_SIZE - poffset); flush_dcache_page(*pages); SetPageUptodate(*pages); } @@ -302,12 +302,12 @@ static int zisofs_readpage(struct file *file, struct page *page) int i, pcount, full_page; unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; unsigned int zisofs_pages_per_cblock = - PAGE_CACHE_SHIFT <= zisofs_block_shift ? - (1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0; + PAGE_SHIFT <= zisofs_block_shift ? + (1 << (zisofs_block_shift - PAGE_SHIFT)) : 0; struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)]; pgoff_t index = page->index, end_index; - end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; /* * If this page is wholly outside i_size we just return zero; * do_generic_file_read() will handle this for us @@ -318,7 +318,7 @@ static int zisofs_readpage(struct file *file, struct page *page) return 0; } - if (PAGE_CACHE_SHIFT <= zisofs_block_shift) { + if (PAGE_SHIFT <= zisofs_block_shift) { /* We have already been given one page, this is the one we must do. */ full_page = index & (zisofs_pages_per_cblock - 1); @@ -351,7 +351,7 @@ static int zisofs_readpage(struct file *file, struct page *page) kunmap(pages[i]); unlock_page(pages[i]); if (i != full_page) - page_cache_release(pages[i]); + put_page(pages[i]); } } diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index b943cbd963bb..e7599615e4e0 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -58,7 +58,7 @@ int get_acorn_filename(struct iso_directory_record *de, std = sizeof(struct iso_directory_record) + de->name_len[0]; if (std & 1) std++; - if ((*((unsigned char *) de) - std) != 32) + if (de->length[0] - std != 32) return retnamlen; chr = ((unsigned char *) de) + std; if (strncmp(chr, "ARCHIMEDES", 10)) @@ -269,7 +269,7 @@ const struct file_operations isofs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = isofs_readdir, + .iterate_shared = isofs_readdir, }; /* diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bcd2d41b318a..131dedc920d8 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1021,7 +1021,7 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock, * the page with useless information without generating any * I/O errors. */ - if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { + if (b_off > ((inode->i_size + PAGE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", __func__, b_off, (unsigned long long)inode->i_size); diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 5384ceb35b1c..98b3eb7d8eaf 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -203,6 +203,8 @@ int get_rock_ridge_filename(struct iso_directory_record *de, int retnamlen = 0; int truncate = 0; int ret = 0; + char *p; + int len; if (!ISOFS_SB(inode->i_sb)->s_rock) return 0; @@ -267,12 +269,17 @@ repeat: rr->u.NM.flags); break; } - if ((strlen(retname) + rr->len - 5) >= 254) { + len = rr->len - 5; + if (retnamlen + len >= 254) { truncate = 1; break; } - strncat(retname, rr->u.NM.name, rr->len - 5); - retnamlen += rr->len - 5; + p = memchr(rr->u.NM.name, '\0', len); + if (unlikely(p)) + len = p - rr->u.NM.name; + memcpy(retname + retnamlen, rr->u.NM.name, len); + retnamlen += len; + retname[retnamlen] = '\0'; break; case SIG('R', 'E'): kfree(rs.buffer); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 517f2de784cf..70078096117d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -81,11 +81,11 @@ static void release_buffer_page(struct buffer_head *bh) if (!trylock_page(page)) goto nope; - page_cache_get(page); + get_page(page); __brelse(bh); try_to_free_buffers(page); unlock_page(page); - page_cache_release(page); + put_page(page); return; nope: @@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WRITE_DATA)) + continue; mapping = jinode->i_vfs_inode->i_mapping; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); @@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WAIT_DATA)) + continue; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index de73a9516a54..b31852f76f46 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); -EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_inode_add_write); +EXPORT_SYMBOL(jbd2_journal_inode_add_wait); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); @@ -2221,7 +2222,7 @@ void jbd2_journal_ack_err(journal_t *journal) int jbd2_journal_blocks_per_page(struct inode *inode) { - return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 08a456b96e4e..805bc6bcd8ab 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -303,7 +303,7 @@ int jbd2_journal_recover(journal_t *journal) * Locate any valid recovery information from the journal and set up the * journal structures in memory to ignore it (presumably because the * caller has evidence that it is out of date). - * This function does'nt appear to be exorted.. + * This function doesn't appear to be exported.. * * We perform one pass over the journal to allow us to tell the user how * much recovery information is being erased, and to let us initialise diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 01e4652d88f6..1749519b362f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -543,7 +543,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved); * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests - * a credit for a number of buffer modications in advance, but can + * a credit for a number of buffer modifications in advance, but can * extend its credit if it needs more. * * jbd2_journal_extend tries to give the running handle more buffer credits. @@ -627,7 +627,7 @@ error_out: * If the jbd2_journal_extend() call above fails to grant new buffer credits * to a running handle, a call to jbd2_journal_restart will commit the * handle's transaction so far and reattach the handle to a new - * transaction capabable of guaranteeing the requested number of + * transaction capable of guaranteeing the requested number of * credits. We preserve reserved handle if there's any attached to the * passed in handle. */ @@ -1586,7 +1586,7 @@ drop: /** * int jbd2_journal_stop() - complete a transaction - * @handle: tranaction to complete. + * @handle: transaction to complete. * * All done for a particular handle. * @@ -2263,7 +2263,7 @@ int jbd2_journal_invalidatepage(journal_t *journal, struct buffer_head *head, *bh, *next; unsigned int stop = offset + length; unsigned int curr_off = 0; - int partial_page = (offset || length < PAGE_CACHE_SIZE); + int partial_page = (offset || length < PAGE_SIZE); int may_free = 1; int ret = 0; @@ -2272,7 +2272,7 @@ int jbd2_journal_invalidatepage(journal_t *journal, if (!page_has_buffers(page)) return 0; - BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + BUG_ON(stop > PAGE_SIZE || stop < length); /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be @@ -2462,7 +2462,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) /* * File inode in the inode list of the handle's transaction */ -int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, + unsigned long flags) { transaction_t *transaction = handle->h_transaction; journal_t *journal; @@ -2487,12 +2488,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) * and if jinode->i_next_transaction == transaction, commit code * will only file the inode where we want it. */ - if (jinode->i_transaction == transaction || - jinode->i_next_transaction == transaction) + if ((jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) && + (jinode->i_flags & flags) == flags) return 0; spin_lock(&journal->j_list_lock); - + jinode->i_flags |= flags; + /* Is inode already attached where we need it? */ if (jinode->i_transaction == transaction || jinode->i_next_transaction == transaction) goto done; @@ -2523,6 +2526,17 @@ done: return 0; } +int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) +{ + return jbd2_journal_file_inode(handle, jinode, + JI_WRITE_DATA | JI_WAIT_DATA); +} + +int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) +{ + return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA); +} + /* * File truncate and transaction commit interact with each other in a * non-trivial way. If a transaction writing data block A is diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 2f7a3c090489..bc2693d56298 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -203,8 +203,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type) acl = ERR_PTR(rc); } kfree(value); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); return acl; } diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c index 1090eb64b90d..9d26b1b9fc01 100644 --- a/fs/jffs2/debug.c +++ b/fs/jffs2/debug.c @@ -95,15 +95,15 @@ __jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f) rather than mucking around with actually reading the node and checking the compression type, which is the real way to tell a hole node. */ - if (frag->ofs & (PAGE_CACHE_SIZE-1) && frag_prev(frag) - && frag_prev(frag)->size < PAGE_CACHE_SIZE && frag_prev(frag)->node) { + if (frag->ofs & (PAGE_SIZE-1) && frag_prev(frag) + && frag_prev(frag)->size < PAGE_SIZE && frag_prev(frag)->node) { JFFS2_ERROR("REF_PRISTINE node at 0x%08x had a previous non-hole frag in the same page. Tell dwmw2.\n", ref_offset(fn->raw)); bitched = 1; } - if ((frag->ofs+frag->size) & (PAGE_CACHE_SIZE-1) && frag_next(frag) - && frag_next(frag)->size < PAGE_CACHE_SIZE && frag_next(frag)->node) { + if ((frag->ofs+frag->size) & (PAGE_SIZE-1) && frag_next(frag) + && frag_next(frag)->size < PAGE_SIZE && frag_next(frag)->node) { JFFS2_ERROR("REF_PRISTINE node at 0x%08x (%08x-%08x) had a following non-hole frag in the same page. Tell dwmw2.\n", ref_offset(fn->raw), frag->ofs, frag->ofs+frag->size); bitched = 1; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 30c4c9ebb693..84c4bf3631a2 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *, const struct file_operations jffs2_dir_operations = { .read = generic_read_dir, - .iterate = jffs2_readdir, + .iterate_shared=jffs2_readdir, .unlocked_ioctl=jffs2_ioctl, .fsync = jffs2_fsync, .llseek = generic_file_llseek, @@ -241,7 +241,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry) { - struct jffs2_sb_info *c = JFFS2_SB_INFO(d_inode(old_dentry)->i_sb); + struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dentry->d_sb); struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry)); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); int ret; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index cad86bac3453..0e62dec3effc 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -87,14 +87,15 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) int ret; jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n", - __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT); + __func__, inode->i_ino, pg->index << PAGE_SHIFT); BUG_ON(!PageLocked(pg)); pg_buf = kmap(pg); /* FIXME: Can kmap fail? */ - ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE); + ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT, + PAGE_SIZE); if (ret) { ClearPageUptodate(pg); @@ -137,8 +138,8 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - uint32_t pageofs = index << PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; + uint32_t pageofs = index << PAGE_SHIFT; int ret = 0; pg = grab_cache_page_write_begin(mapping, index, flags); @@ -230,7 +231,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, out_page: unlock_page(pg); - page_cache_release(pg); + put_page(pg); return ret; } @@ -245,14 +246,14 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode *ri; - unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned start = pos & (PAGE_SIZE - 1); unsigned end = start + copied; unsigned aligned_start = start & ~3; int ret = 0; uint32_t writtenlen = 0; jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n", - __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT, + __func__, inode->i_ino, pg->index << PAGE_SHIFT, start, end, pg->flags); /* We need to avoid deadlock with page_cache_read() in @@ -261,7 +262,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, to re-lock it. */ BUG_ON(!PageUptodate(pg)); - if (end == PAGE_CACHE_SIZE) { + if (end == PAGE_SIZE) { /* When writing out the end of a page, write out the _whole_ page. This helps to reduce the number of nodes in files which have many short writes, like @@ -275,7 +276,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, jffs2_dbg(1, "%s(): Allocation of raw inode failed\n", __func__); unlock_page(pg); - page_cache_release(pg); + put_page(pg); return -ENOMEM; } @@ -292,7 +293,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, kmap(pg); ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start, - (pg->index << PAGE_CACHE_SHIFT) + aligned_start, + (pg->index << PAGE_SHIFT) + aligned_start, end - aligned_start, &writtenlen); kunmap(pg); @@ -329,6 +330,6 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, jffs2_dbg(1, "%s() returning %d\n", __func__, writtenlen > 0 ? writtenlen : ret); unlock_page(pg); - page_cache_release(pg); + put_page(pg); return writtenlen > 0 ? writtenlen : ret; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index bead25ae8fe4..ae2ebb26b446 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -586,8 +586,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) goto out_root; sb->s_maxbytes = 0xFFFFFFFF; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = JFFS2_SUPER_MAGIC; if (!(sb->s_flags & MS_RDONLY)) jffs2_start_garbage_collect_thread(c); @@ -685,7 +685,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, struct inode *inode = OFNI_EDONI_2SFFJ(f); struct page *pg; - pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, + pg = read_cache_page(inode->i_mapping, offset >> PAGE_SHIFT, (void *)jffs2_do_readpage_unlock, inode); if (IS_ERR(pg)) return (void *)pg; @@ -701,7 +701,7 @@ void jffs2_gc_release_page(struct jffs2_sb_info *c, struct page *pg = (void *)*priv; kunmap(pg); - page_cache_release(pg); + put_page(pg); } static int jffs2_flash_setup(struct jffs2_sb_info *c) { diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 7e553f286775..9ed0f26cf023 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -552,7 +552,7 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_era goto upnout; } /* We found a datanode. Do the GC */ - if((start >> PAGE_CACHE_SHIFT) < ((end-1) >> PAGE_CACHE_SHIFT)) { + if((start >> PAGE_SHIFT) < ((end-1) >> PAGE_SHIFT)) { /* It crosses a page boundary. Therefore, it must be a hole. */ ret = jffs2_garbage_collect_hole(c, jeb, f, fn, start, end); } else { @@ -1192,8 +1192,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era struct jffs2_node_frag *frag; uint32_t min, max; - min = start & ~(PAGE_CACHE_SIZE-1); - max = min + PAGE_CACHE_SIZE; + min = start & ~(PAGE_SIZE-1); + max = min + PAGE_SIZE; frag = jffs2_lookup_node_frag(&f->fragtree, start); @@ -1351,7 +1351,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset); datalen = end - offset; - writebuf = pg_ptr + (offset & (PAGE_CACHE_SIZE -1)); + writebuf = pg_ptr + (offset & (PAGE_SIZE -1)); comprtype = jffs2_compress(c, f, writebuf, &comprbuf, &datalen, &cdatalen); diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index 9a5449bc3afb..b86c78d178c6 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -90,7 +90,7 @@ uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, /* If the last fragment starts at the RAM page boundary, it is * REF_PRISTINE irrespective of its size. */ - if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) { + if (frag->node && (frag->ofs & (PAGE_SIZE - 1)) == 0) { dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n", frag->ofs, frag->ofs + frag->size); frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE; @@ -237,7 +237,7 @@ static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *r If so, both 'this' and the new node get marked REF_NORMAL so the GC can take a look. */ - if (lastend && (lastend-1) >> PAGE_CACHE_SHIFT == newfrag->ofs >> PAGE_CACHE_SHIFT) { + if (lastend && (lastend-1) >> PAGE_SHIFT == newfrag->ofs >> PAGE_SHIFT) { if (this->node) mark_ref_normal(this->node->raw); mark_ref_normal(newfrag->node->raw); @@ -382,7 +382,7 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in /* If we now share a page with other nodes, mark either previous or next node REF_NORMAL, as appropriate. */ - if (newfrag->ofs & (PAGE_CACHE_SIZE-1)) { + if (newfrag->ofs & (PAGE_SIZE-1)) { struct jffs2_node_frag *prev = frag_prev(newfrag); mark_ref_normal(fn->raw); @@ -391,7 +391,7 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in mark_ref_normal(prev->node->raw); } - if ((newfrag->ofs+newfrag->size) & (PAGE_CACHE_SIZE-1)) { + if ((newfrag->ofs+newfrag->size) & (PAGE_SIZE-1)) { struct jffs2_node_frag *next = frag_next(newfrag); if (next) { diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 7a28facd7175..3ed9a4b49778 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -49,10 +49,10 @@ int jffs2_init_security(struct inode *inode, struct inode *dir, /* ---- XATTR Handler for "security.*" ----------------- */ static int jffs2_security_getxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, + return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size); } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 0a9a114bb9d1..5ef21f4c4c77 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -147,7 +147,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child) JFFS2_DEBUG("Parent of directory ino #%u is #%u\n", f->inocache->ino, pino); - return d_obtain_alias(jffs2_iget(d_inode(child)->i_sb, pino)); + return d_obtain_alias(jffs2_iget(child->d_sb, pino)); } static const struct export_operations jffs2_export_ops = { diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c index b634de4c8101..7fb187ab2682 100644 --- a/fs/jffs2/write.c +++ b/fs/jffs2/write.c @@ -172,8 +172,8 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 beginning of a page and runs to the end of the file, or if it's a hole node, mark it REF_PRISTINE, else REF_NORMAL. */ - if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) || - ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) && + if ((je32_to_cpu(ri->dsize) >= PAGE_SIZE) || + ( ((je32_to_cpu(ri->offset)&(PAGE_SIZE-1))==0) && (je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) == je32_to_cpu(ri->isize)))) { flash_ofs |= REF_PRISTINE; } else { @@ -366,7 +366,8 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, break; } mutex_lock(&f->sem); - datalen = min_t(uint32_t, writelen, PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1))); + datalen = min_t(uint32_t, writelen, + PAGE_SIZE - (offset & (PAGE_SIZE-1))); cdatalen = min_t(uint32_t, alloclen - sizeof(*ri), datalen); comprtype = jffs2_compress(c, f, buf, &comprbuf, &datalen, &cdatalen); diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index b2555ef07a12..4ebecff1d922 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -17,10 +17,10 @@ #include "nodelist.h" static int jffs2_trusted_getxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, + return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 539bd630b5e4..bce249e1b277 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -17,10 +17,10 @@ #include "nodelist.h" static int jffs2_user_getxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER, + return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size); } diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 49456853e9de..21fa92ba2c19 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -34,10 +34,6 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) int size; char *value = NULL; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch(type) { case ACL_TYPE_ACCESS: ea_name = XATTR_NAME_POSIX_ACL_ACCESS; @@ -67,8 +63,6 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) acl = posix_acl_from_xattr(&init_user_ns, value, size); } kfree(value); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); return acl; } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 4ce7735dd042..7f1a585a0a94 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -140,10 +140,10 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) } const struct inode_operations jfs_file_inode_operations = { - .setxattr = jfs_setxattr, - .getxattr = jfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = jfs_listxattr, - .removexattr = jfs_removexattr, + .removexattr = generic_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 9d9bae63ae2a..ad3e7b1effc4 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -102,8 +102,8 @@ int jfs_commit_inode(struct inode *inode, int wait) * partitions and may think inode is dirty */ if (!special_file(inode->i_mode) && noisy) { - jfs_err("jfs_commit_inode(0x%p) called on " - "read-only volume", inode); + jfs_err("jfs_commit_inode(0x%p) called on read-only volume", + inode); jfs_err("Is remount racy?"); noisy--; } @@ -332,8 +332,7 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, jfs_get_block); } -static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -341,7 +340,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, jfs_get_block); /* * In case of error extending write may have instantiated a few @@ -349,7 +348,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + count; + loff_t end = iocb->ki_pos + count; if (end > isize) jfs_write_failed(mapping, end); diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c index dfcd50304559..f76ff0a46444 100644 --- a/fs/jfs/jfs_discard.c +++ b/fs/jfs/jfs_discard.c @@ -49,14 +49,12 @@ void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks) r = sb_issue_discard(sb, blkno, nblocks, GFP_NOFS, 0); if (unlikely(r != 0)) { - jfs_err("JFS: sb_issue_discard" \ - "(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!\n", + jfs_err("JFS: sb_issue_discard(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!", sb, (unsigned long long)blkno, (unsigned long long)nblocks, r); } - jfs_info("JFS: sb_issue_discard" \ - "(%p, %llu, %llu, GFP_NOFS, 0) = %d\n", + jfs_info("JFS: sb_issue_discard(%p, %llu, %llu, GFP_NOFS, 0) = %d", sb, (unsigned long long)blkno, (unsigned long long)nblocks, r); diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index d88576e23fe4..de2bcb36e079 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -3072,8 +3072,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) } if (dirtab_slot.flag == DIR_INDEX_FREE) { if (loop_count++ > JFS_IP(ip)->next_index) { - jfs_err("jfs_readdir detected " - "infinite loop!"); + jfs_err("jfs_readdir detected infinite loop!"); ctx->pos = DIREND; return 0; } @@ -3151,8 +3150,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; } else { - jfs_err("jfs_readdir called with " - "invalid offset!"); + jfs_err("jfs_readdir called with invalid offset!"); } dtoffset->pn = 1; dtoffset->index = 0; @@ -3165,8 +3163,8 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) } if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { - jfs_err("jfs_readdir: unexpected rc = %d " - "from dtReadNext", rc); + jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext", + rc); ctx->pos = DIREND; return 0; } diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index f321986e73d2..6aca224a5d68 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -534,8 +534,7 @@ void diWriteSpecial(struct inode *ip, int secondary) /* read the page of fixed disk inode (AIT) in raw mode */ mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); if (mp == NULL) { - jfs_err("diWriteSpecial: failed to read aggregate inode " - "extent!"); + jfs_err("diWriteSpecial: failed to read aggregate inode extent!"); return; } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index cf7936fe2e68..5e33cb9a190d 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -151,7 +151,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_inode->xtlid = 0; jfs_set_inode_flags(inode); - jfs_info("ialloc returns inode = 0x%p\n", inode); + jfs_info("ialloc returns inode = 0x%p", inode); return inode; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a270cb7ff4e0..63759d723920 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1094,7 +1094,7 @@ int lmLogOpen(struct super_block *sb) if (log->bdev->bd_dev == sbi->logdev) { if (memcmp(log->uuid, sbi->loguuid, sizeof(log->uuid))) { - jfs_warn("wrong uuid on JFS journal\n"); + jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); return -EINVAL; } @@ -1333,9 +1333,8 @@ int lmLogInit(struct jfs_log * log) rc = -EINVAL; goto errout20; } - jfs_info("lmLogInit: inline log:0x%p base:0x%Lx " - "size:0x%x", log, - (unsigned long long) log->base, log->size); + jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x", + log, (unsigned long long)log->base, log->size); } else { if (memcmp(logsuper->uuid, log->uuid, 16)) { jfs_warn("wrong uuid on JFS log device"); @@ -1343,9 +1342,8 @@ int lmLogInit(struct jfs_log * log) } log->size = le32_to_cpu(logsuper->size); log->l2bsize = le32_to_cpu(logsuper->l2bsize); - jfs_info("lmLogInit: external log:0x%p base:0x%Lx " - "size:0x%x", log, - (unsigned long long) log->base, log->size); + jfs_info("lmLogInit: external log:0x%p base:0x%Lx size:0x%x", + log, (unsigned long long)log->base, log->size); } log->page = le32_to_cpu(logsuper->end) / LOGPSIZE; @@ -2136,7 +2134,7 @@ static void lbmStartIO(struct lbuf * bp) struct bio *bio; struct jfs_log *log = bp->l_log; - jfs_info("lbmStartIO\n"); + jfs_info("lbmStartIO"); bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index a3eb316b1ac3..b60e015cc757 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -80,7 +80,7 @@ static inline void lock_metapage(struct metapage *mp) static struct kmem_cache *metapage_cache; static mempool_t *metapage_mempool; -#define MPS_PER_PAGE (PAGE_CACHE_SIZE >> L2PSIZE) +#define MPS_PER_PAGE (PAGE_SIZE >> L2PSIZE) #if MPS_PER_PAGE > 1 @@ -316,7 +316,7 @@ static void last_write_complete(struct page *page) struct metapage *mp; unsigned int offset; - for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { mp = page_to_mp(page, offset); if (mp && test_bit(META_io, &mp->flag)) { if (mp->lsn) @@ -366,12 +366,12 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) int bad_blocks = 0; page_start = (sector_t)page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); + (PAGE_SHIFT - inode->i_blkbits); BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); set_page_writeback(page); - for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { mp = page_to_mp(page, offset); if (!mp || !test_bit(META_dirty, &mp->flag)) @@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) bio = NULL; } else inc_io(page); - xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits; + xlen = (PAGE_SIZE - offset) >> inode->i_blkbits; pblock = metapage_get_blocks(inode, lblock, &xlen); if (!pblock) { printk(KERN_ERR "JFS: metapage_get_blocks failed\n"); @@ -485,7 +485,7 @@ static int metapage_readpage(struct file *fp, struct page *page) struct inode *inode = page->mapping->host; struct bio *bio = NULL; int block_offset; - int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; sector_t page_start; /* address of page in fs blocks */ sector_t pblock; int xlen; @@ -494,7 +494,7 @@ static int metapage_readpage(struct file *fp, struct page *page) BUG_ON(!PageLocked(page)); page_start = (sector_t)page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); + (PAGE_SHIFT - inode->i_blkbits); block_offset = 0; while (block_offset < blocks_per_page) { @@ -542,7 +542,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) int ret = 1; int offset; - for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { mp = page_to_mp(page, offset); if (!mp) @@ -568,7 +568,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) static void metapage_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - BUG_ON(offset || length < PAGE_CACHE_SIZE); + BUG_ON(offset || length < PAGE_SIZE); BUG_ON(PageWriteback(page)); @@ -599,10 +599,10 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, inode->i_ino, lblock, absolute); l2bsize = inode->i_blkbits; - l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize; + l2BlocksPerPage = PAGE_SHIFT - l2bsize; page_index = lblock >> l2BlocksPerPage; page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize; - if ((page_offset + size) > PAGE_CACHE_SIZE) { + if ((page_offset + size) > PAGE_SIZE) { jfs_err("MetaData crosses page boundary!!"); jfs_err("lblock = %lx, size = %d", lblock, size); dump_stack(); @@ -621,7 +621,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, mapping = inode->i_mapping; } - if (new && (PSIZE == PAGE_CACHE_SIZE)) { + if (new && (PSIZE == PAGE_SIZE)) { page = grab_cache_page(mapping, page_index); if (!page) { jfs_err("grab_cache_page failed!"); @@ -693,7 +693,7 @@ unlock: void grab_metapage(struct metapage * mp) { jfs_info("grab_metapage: mp = 0x%p", mp); - page_cache_get(mp->page); + get_page(mp->page); lock_page(mp->page); mp->count++; lock_metapage(mp); @@ -706,12 +706,12 @@ void force_metapage(struct metapage *mp) jfs_info("force_metapage: mp = 0x%p", mp); set_bit(META_forcewrite, &mp->flag); clear_bit(META_sync, &mp->flag); - page_cache_get(page); + get_page(page); lock_page(page); set_page_dirty(page); write_one_page(page, 1); clear_bit(META_forcewrite, &mp->flag); - page_cache_release(page); + put_page(page); } void hold_metapage(struct metapage *mp) @@ -726,7 +726,7 @@ void put_metapage(struct metapage *mp) unlock_page(mp->page); return; } - page_cache_get(mp->page); + get_page(mp->page); mp->count++; lock_metapage(mp); unlock_page(mp->page); @@ -746,7 +746,7 @@ void release_metapage(struct metapage * mp) assert(mp->count); if (--mp->count || mp->nohomeok) { unlock_page(page); - page_cache_release(page); + put_page(page); return; } @@ -764,13 +764,13 @@ void release_metapage(struct metapage * mp) drop_metapage(page, mp); unlock_page(page); - page_cache_release(page); + put_page(page); } void __invalidate_metapages(struct inode *ip, s64 addr, int len) { sector_t lblock; - int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits; + int l2BlocksPerPage = PAGE_SHIFT - ip->i_blkbits; int BlocksPerPage = 1 << l2BlocksPerPage; /* All callers are interested in block device's mapping */ struct address_space *mapping = @@ -788,7 +788,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len) page = find_lock_page(mapping, lblock >> l2BlocksPerPage); if (!page) continue; - for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { mp = page_to_mp(page, offset); if (!mp) continue; @@ -803,7 +803,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len) remove_from_logsync(mp); } unlock_page(page); - page_cache_release(page); + put_page(page); } } diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h index 337e9e51ac06..a869fb4a20d6 100644 --- a/fs/jfs/jfs_metapage.h +++ b/fs/jfs/jfs_metapage.h @@ -106,7 +106,7 @@ static inline void metapage_nohomeok(struct metapage *mp) lock_page(page); if (!mp->nohomeok++) { mark_metapage_dirty(mp); - page_cache_get(page); + get_page(page); wait_on_page_writeback(page); } unlock_page(page); @@ -128,7 +128,7 @@ static inline void metapage_wait_for_io(struct metapage *mp) static inline void _metapage_homeok(struct metapage *mp) { if (!--mp->nohomeok) - page_cache_release(mp->page); + put_page(mp->page); } static inline void metapage_homeok(struct metapage *mp) diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index d595856453b2..eddf2b6eda85 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1764,7 +1764,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, if (lwm == next) goto out; if (lwm > next) { - jfs_err("xtLog: lwm > next\n"); + jfs_err("xtLog: lwm > next"); goto out; } tlck->flag |= tlckUPDATEMAP; @@ -1798,8 +1798,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, xadlock->xdlist = &p->xad[lwm]; tblk->xflag &= ~COMMIT_LAZY; } - jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d " - "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count); + jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d count:%d", + tlck->ip, mp, tlck, lwm, xadlock->count); maplock->index = 1; @@ -2025,8 +2025,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, xadlock->count = next - lwm; xadlock->xdlist = &p->xad[lwm]; - jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d " - "lwm:%d next:%d", + jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d lwm:%d next:%d", tlck->ip, mp, xadlock->count, lwm, next); maplock->index++; xadlock++; @@ -2047,8 +2046,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, pxdlock->count = 1; pxdlock->pxd = pxd; - jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d " - "hwm:%d", ip, mp, pxdlock->count, hwm); + jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d hwm:%d", + ip, mp, pxdlock->count, hwm); maplock->index++; xadlock++; } @@ -2066,8 +2065,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, xadlock->count = hwm - next + 1; xadlock->xdlist = &p->xad[next]; - jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d " - "next:%d hwm:%d", + jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d next:%d hwm:%d", tlck->ip, mp, xadlock->count, next, hwm); maplock->index++; } @@ -2523,8 +2521,7 @@ void txFreeMap(struct inode *ip, xlen = lengthXAD(xad); dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, tblk); - jfs_info("freePMap: xaddr:0x%lx " - "xlen:%d", + jfs_info("freePMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } @@ -2814,7 +2811,7 @@ int jfs_lazycommit(void *arg) if (!list_empty(&TxAnchor.unlock_queue)) jfs_err("jfs_lazycommit being killed w/pending transactions!"); else - jfs_info("jfs_lazycommit being killed\n"); + jfs_info("jfs_lazycommit being killed"); return 0; } diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index e8d717dabca3..561f6af46288 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -19,6 +19,8 @@ #ifndef H_JFS_XATTR #define H_JFS_XATTR +#include <linux/xattr.h> + /* * jfs_ea_list describe the on-disk format of the extended attributes. * I know the null-terminator is redundant since namelen is stored, but @@ -54,12 +56,8 @@ struct jfs_ea_list { extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *, size_t, int); -extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t, - int); extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); -extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); -extern int jfs_removexattr(struct dentry *, const char *); extern const struct xattr_handler *jfs_xattr_handlers[]; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 701f89370de7..539deddecbb0 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1225,8 +1225,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_CREATE); if (rc) { - jfs_err("jfs_rename didn't expect dtSearch to fail " - "w/rc = %d", rc); + jfs_err("jfs_rename didn't expect dtSearch to fail w/rc = %d", + rc); goto out_tx; } @@ -1524,7 +1524,7 @@ struct dentry *jfs_get_parent(struct dentry *dentry) parent_ino = le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot); - return d_obtain_alias(jfs_iget(d_inode(dentry)->i_sb, parent_ino)); + return d_obtain_alias(jfs_iget(dentry->d_sb, parent_ino)); } const struct inode_operations jfs_dir_inode_operations = { @@ -1537,10 +1537,10 @@ const struct inode_operations jfs_dir_inode_operations = { .rmdir = jfs_rmdir, .mknod = jfs_mknod, .rename = jfs_rename, - .setxattr = jfs_setxattr, - .getxattr = jfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = jfs_listxattr, - .removexattr = jfs_removexattr, + .removexattr = generic_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4f5d85ba8e23..cec8814a3b8b 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -84,7 +84,7 @@ static void jfs_handle_error(struct super_block *sb) panic("JFS (device %s): panic forced after error\n", sb->s_id); else if (sbi->flag & JFS_ERR_REMOUNT_RO) { - jfs_err("ERROR: (device %s): remounting filesystem as read-only\n", + jfs_err("ERROR: (device %s): remounting filesystem as read-only", sb->s_id); sb->s_flags |= MS_RDONLY; } @@ -596,7 +596,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) * Page cache is indexed by long. * I would use MAX_LFS_FILESIZE, but it's only half as big */ - sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, + sb->s_maxbytes = min(((u64) PAGE_SIZE << 32) - 1, (u64)sb->s_maxbytes); #endif sb->s_time_gran = 1; @@ -641,7 +641,7 @@ static int jfs_freeze(struct super_block *sb) } rc = updateSuper(sb, FM_CLEAN); if (rc) { - jfs_err("jfs_freeze: updateSuper failed\n"); + jfs_err("jfs_freeze: updateSuper failed"); /* * Don't fail here. Everything succeeded except * marking the superblock clean, so there's really diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index f8db4fde0b0b..c94c7e4a1323 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -25,19 +25,19 @@ const struct inode_operations jfs_fast_symlink_inode_operations = { .readlink = generic_readlink, .get_link = simple_get_link, .setattr = jfs_setattr, - .setxattr = jfs_setxattr, - .getxattr = jfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = jfs_listxattr, - .removexattr = jfs_removexattr, + .removexattr = generic_removexattr, }; const struct inode_operations jfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, .setattr = jfs_setattr, - .setxattr = jfs_setxattr, - .getxattr = jfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = jfs_listxattr, - .removexattr = jfs_removexattr, + .removexattr = generic_removexattr, }; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 48b15a6e5558..beb182b503b3 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -86,6 +86,14 @@ struct ea_buffer { #define EA_MALLOC 0x0008 +/* + * Mapping of on-disk attribute names: for on-disk attribute names with an + * unknown prefix (not "system.", "user.", "security.", or "trusted."), the + * prefix "os2." is prepended. On the way back to disk, "os2." prefixes are + * stripped and we make sure that the remaining name does not start with one + * of the know prefixes. + */ + static int is_known_namespace(const char *name) { if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && @@ -97,29 +105,19 @@ static int is_known_namespace(const char *name) return true; } -/* - * These three routines are used to recognize on-disk extended attributes - * that are in a recognized namespace. If the attribute is not recognized, - * "os2." is prepended to the name - */ -static int is_os2_xattr(struct jfs_ea *ea) -{ - return !is_known_namespace(ea->name); -} - static inline int name_size(struct jfs_ea *ea) { - if (is_os2_xattr(ea)) - return ea->namelen + XATTR_OS2_PREFIX_LEN; - else + if (is_known_namespace(ea->name)) return ea->namelen; + else + return ea->namelen + XATTR_OS2_PREFIX_LEN; } static inline int copy_name(char *buffer, struct jfs_ea *ea) { int len = ea->namelen; - if (is_os2_xattr(ea)) { + if (!is_known_namespace(ea->name)) { memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN); buffer += XATTR_OS2_PREFIX_LEN; len += XATTR_OS2_PREFIX_LEN; @@ -665,35 +663,6 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, return 0; } -/* - * Most of the permission checking is done by xattr_permission in the vfs. - * We also need to verify that this is a namespace that we recognize. - */ -static int can_set_xattr(struct inode *inode, const char *name, - const void *value, size_t value_len) -{ - if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { - /* - * This makes sure that we aren't trying to set an - * attribute in a different namespace by prefixing it - * with "os2." - */ - if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) - return -EOPNOTSUPP; - return 0; - } - - /* - * Don't allow setting an attribute in an unknown namespace. - */ - if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && - strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && - strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EOPNOTSUPP; - - return 0; -} - int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, const void *value, size_t value_len, int flags) { @@ -704,21 +673,10 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, int xattr_size; int new_size; int namelen = strlen(name); - char *os2name = NULL; int found = 0; int rc; int length; - if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { - os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, - GFP_KERNEL); - if (!os2name) - return -ENOMEM; - strcpy(os2name, name + XATTR_OS2_PREFIX_LEN); - name = os2name; - namelen -= XATTR_OS2_PREFIX_LEN; - } - down_write(&JFS_IP(inode)->xattr_sem); xattr_size = ea_get(inode, &ea_buf, 0); @@ -841,44 +799,6 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, out: up_write(&JFS_IP(inode)->xattr_sem); - kfree(os2name); - - return rc; -} - -int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t value_len, int flags) -{ - struct inode *inode = d_inode(dentry); - struct jfs_inode_info *ji = JFS_IP(inode); - int rc; - tid_t tid; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, value_len, flags); - - if ((rc = can_set_xattr(inode, name, value, value_len))) - return rc; - - if (value == NULL) { /* empty EA, do not remove */ - value = ""; - value_len = 0; - } - - tid = txBegin(inode->i_sb, 0); - mutex_lock(&ji->commit_mutex); - rc = __jfs_setxattr(tid, d_inode(dentry), name, value, value_len, - flags); - if (!rc) - rc = txCommit(tid, 1, &inode, 0); - txEnd(tid); - mutex_unlock(&ji->commit_mutex); - return rc; } @@ -933,37 +853,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, return size; } -ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, - size_t buf_size) -{ - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, data, buf_size); - - if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { - /* - * skip past "os2." prefix - */ - name += XATTR_OS2_PREFIX_LEN; - /* - * Don't allow retrieving properly prefixed attributes - * by prepending them with "os2." - */ - if (is_known_namespace(name)) - return -EOPNOTSUPP; - } - - err = __jfs_getxattr(d_inode(dentry), name, data, buf_size); - - return err; -} - /* * No special permissions are needed to list attributes except for trusted.* */ @@ -1027,27 +916,16 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) return size; } -int jfs_removexattr(struct dentry *dentry, const char *name) +static int __jfs_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct jfs_inode_info *ji = JFS_IP(inode); - int rc; tid_t tid; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); - - if ((rc = can_set_xattr(inode, name, NULL, 0))) - return rc; + int rc; tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); - rc = __jfs_setxattr(tid, d_inode(dentry), name, NULL, 0, XATTR_REPLACE); + rc = __jfs_setxattr(tid, inode, name, value, size, flags); if (!rc) rc = txCommit(tid, 1, &inode, 0); txEnd(tid); @@ -1056,15 +934,77 @@ int jfs_removexattr(struct dentry *dentry, const char *name) return rc; } -/* - * List of handlers for synthetic system.* attributes. All real ondisk - * attributes are handled directly. - */ +static int jfs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) +{ + name = xattr_full_name(handler, name); + return __jfs_getxattr(inode, name, value, size); +} + +static int jfs_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = d_inode(dentry); + + name = xattr_full_name(handler, name); + return __jfs_xattr_set(inode, name, value, size, flags); +} + +static int jfs_xattr_get_os2(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) +{ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + return __jfs_getxattr(inode, name, value, size); +} + +static int jfs_xattr_set_os2(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = d_inode(dentry); + + if (is_known_namespace(name)) + return -EOPNOTSUPP; + return __jfs_xattr_set(inode, name, value, size, flags); +} + +static const struct xattr_handler jfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = jfs_xattr_get, + .set = jfs_xattr_set, +}; + +static const struct xattr_handler jfs_os2_xattr_handler = { + .prefix = XATTR_OS2_PREFIX, + .get = jfs_xattr_get_os2, + .set = jfs_xattr_set_os2, +}; + +static const struct xattr_handler jfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = jfs_xattr_get, + .set = jfs_xattr_set, +}; + +static const struct xattr_handler jfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = jfs_xattr_get, + .set = jfs_xattr_set, +}; + const struct xattr_handler *jfs_xattr_handlers[] = { #ifdef CONFIG_JFS_POSIX_ACL &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, #endif + &jfs_os2_xattr_handler, + &jfs_user_xattr_handler, + &jfs_security_xattr_handler, + &jfs_trusted_xattr_handler, NULL, }; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 03b688d19f69..8a652404eef6 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -153,9 +153,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, p = buf + len + nlen; *p = '\0'; for (kn = kn_to; kn != common; kn = kn->parent) { - nlen = strlen(kn->name); - p -= nlen; - memcpy(p, kn->name, nlen); + size_t tmp = strlen(kn->name); + p -= tmp; + memcpy(p, kn->name, tmp); *(--p) = '/'; } @@ -753,7 +753,8 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr = parent->iattr; if (ps_iattr) { struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + ktime_get_real_ts(&ps_iattrs->ia_ctime); + ps_iattrs->ia_mtime = ps_iattrs->ia_ctime; } mutex_unlock(&kernfs_mutex); @@ -1279,8 +1280,9 @@ static void __kernfs_remove(struct kernfs_node *kn) /* update timestamps on the parent */ if (ps_iattr) { - ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; - ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + ktime_get_real_ts(&ps_iattr->ia_iattr.ia_ctime); + ps_iattr->ia_iattr.ia_mtime = + ps_iattr->ia_iattr.ia_ctime; } kernfs_put(pos); @@ -1643,22 +1645,9 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) return 0; } -static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, - int whence) -{ - struct inode *inode = file_inode(file); - loff_t ret; - - inode_lock(inode); - ret = generic_file_llseek(file, offset, whence); - inode_unlock(inode); - - return ret; -} - const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, - .iterate = kernfs_fop_readdir, + .iterate_shared = kernfs_fop_readdir, .release = kernfs_dir_fop_release, - .llseek = kernfs_dir_fop_llseek, + .llseek = generic_file_llseek, }; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 7247252ee9b1..e1574008adc9 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -190,15 +190,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, char *buf; buf = of->prealloc_buf; - if (!buf) + if (buf) + mutex_lock(&of->prealloc_mutex); + else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that - * the ops aren't called concurrently for the same open file, and - * to provide exclusive access to ->prealloc_buf (when that exists). + * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -214,21 +215,23 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, else len = -EINVAL; + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + if (len < 0) - goto out_unlock; + goto out_free; if (copy_to_user(user_buf, buf, len)) { len = -EFAULT; - goto out_unlock; + goto out_free; } *ppos += len; - out_unlock: - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); out_free: - if (buf != of->prealloc_buf) + if (buf == of->prealloc_buf) + mutex_unlock(&of->prealloc_mutex); + else kfree(buf); return len; } @@ -284,15 +287,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, } buf = of->prealloc_buf; - if (!buf) + if (buf) + mutex_lock(&of->prealloc_mutex); + else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + /* * @of->mutex nests outside active ref and is used both to ensure that - * the ops aren't called concurrently for the same open file, and - * to provide exclusive access to ->prealloc_buf (when that exists). + * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -301,26 +311,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, goto out_free; } - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_unlock; - } - buf[len] = '\0'; /* guarantee string termination */ - ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, *ppos); else len = -EINVAL; + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + if (len > 0) *ppos += len; -out_unlock: - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); out_free: - if (buf != of->prealloc_buf) + if (buf == of->prealloc_buf) + mutex_unlock(&of->prealloc_mutex); + else kfree(buf); return len; } @@ -687,6 +693,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) error = -ENOMEM; if (!of->prealloc_buf) goto err_free; + mutex_init(&of->prealloc_mutex); } /* diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 16405ae88d2d..1719649d7ad7 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -54,7 +54,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_mode = kn->mode; iattrs->ia_uid = GLOBAL_ROOT_UID; iattrs->ia_gid = GLOBAL_ROOT_GID; - iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + ktime_get_real_ts(&iattrs->ia_atime); + iattrs->ia_mtime = iattrs->ia_atime; + iattrs->ia_ctime = iattrs->ia_atime; simple_xattrs_init(&kn->iattr->xattrs); out_unlock: @@ -208,10 +211,10 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name) return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE); } -ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, - size_t size) +ssize_t kernfs_iop_getxattr(struct dentry *unused, struct inode *inode, + const char *name, void *buf, size_t size) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); @@ -236,16 +239,18 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = + inode->i_ctime = current_fs_time(inode->i_sb); } static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) { + struct super_block *sb = inode->i_sb; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; - inode->i_atime = iattr->ia_atime; - inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; + inode->i_atime = timespec_trunc(iattr->ia_atime, sb->s_time_gran); + inode->i_mtime = timespec_trunc(iattr->ia_mtime, sb->s_time_gran); + inode->i_ctime = timespec_trunc(iattr->ia_ctime, sb->s_time_gran); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 6762bfbd8207..45c9192c276e 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -84,8 +84,8 @@ int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int kernfs_iop_removexattr(struct dentry *dentry, const char *name); -ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, - size_t size); +ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, void *buf, size_t size); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); /* diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index b67dbccdaf88..63534f5f9073 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/namei.h> +#include <linux/seq_file.h> #include "kernfs-internal.h" @@ -40,6 +41,19 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) return 0; } +static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_node *node = dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(node); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_path) + return scops->show_path(sf, node, root); + + seq_dentry(sf, dentry, " \t\n\\"); + return 0; +} + const struct super_operations kernfs_sops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, @@ -47,6 +61,7 @@ const struct super_operations kernfs_sops = { .remount_fs = kernfs_sop_remount_fs, .show_options = kernfs_sop_show_options, + .show_path = kernfs_sop_show_path, }; /** @@ -120,9 +135,8 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, kntmp = find_next_ancestor(kn, knparent); if (WARN_ON(!kntmp)) return ERR_PTR(-EINVAL); - mutex_lock(&d_inode(dentry)->i_mutex); - dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name)); - mutex_unlock(&d_inode(dentry)->i_mutex); + dtmp = lookup_one_len_unlocked(kntmp->name, dentry, + strlen(kntmp->name)); dput(dentry); if (IS_ERR(dtmp)) return dtmp; @@ -138,8 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) struct dentry *root; info->sb = sb; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_time_gran = 1; diff --git a/fs/libfs.c b/fs/libfs.c index 0ca80b2af420..8765ff1adc07 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -25,7 +25,7 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); - stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9); + stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } EXPORT_SYMBOL(simple_getattr); @@ -33,7 +33,7 @@ EXPORT_SYMBOL(simple_getattr); int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { buf->f_type = dentry->d_sb->s_magic; - buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; return 0; } @@ -89,7 +89,6 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -97,7 +96,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -124,7 +122,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - inode_unlock(d_inode(dentry)); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -190,7 +187,7 @@ const struct file_operations simple_dir_operations = { .release = dcache_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, - .iterate = dcache_readdir, + .iterate_shared = dcache_readdir, .fsync = noop_fsync, }; EXPORT_SYMBOL(simple_dir_operations); @@ -395,7 +392,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping, struct page *page; pgoff_t index; - index = pos >> PAGE_CACHE_SHIFT; + index = pos >> PAGE_SHIFT; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -403,10 +400,10 @@ int simple_write_begin(struct file *file, struct address_space *mapping, *pagep = page; - if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + if (!PageUptodate(page) && (len != PAGE_SIZE)) { + unsigned from = pos & (PAGE_SIZE - 1); - zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE); + zero_user_segments(page, 0, from, from + len, PAGE_SIZE); } return 0; } @@ -442,7 +439,7 @@ int simple_write_end(struct file *file, struct address_space *mapping, /* zero the stale part of the page if we did a short copy */ if (copied < len) { - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned from = pos & (PAGE_SIZE - 1); zero_user(page, from + copied, len - copied); } @@ -458,7 +455,7 @@ int simple_write_end(struct file *file, struct address_space *mapping, set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); return copied; } @@ -477,8 +474,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic, struct dentry *dentry; int i; - s->s_blocksize = PAGE_CACHE_SIZE; - s->s_blocksize_bits = PAGE_CACHE_SHIFT; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; s->s_op = &simple_super_operations; s->s_time_gran = 1; @@ -994,12 +991,12 @@ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; u64 last_fs_page = - last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); + last_fs_block >> (PAGE_SHIFT - blocksize_bits); if (unlikely(num_blocks == 0)) return 0; - if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) + if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || @@ -1127,8 +1124,8 @@ static int empty_dir_setxattr(struct dentry *dentry, const char *name, return -EOPNOTSUPP; } -static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name, - void *value, size_t size) +static ssize_t empty_dir_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) { return -EOPNOTSUPP; } @@ -1169,7 +1166,7 @@ static int empty_dir_readdir(struct file *file, struct dir_context *ctx) static const struct file_operations empty_dir_operations = { .llseek = empty_dir_llseek, .read = generic_read_dir, - .iterate = empty_dir_readdir, + .iterate_shared = empty_dir_readdir, .fsync = noop_fsync, }; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index a709d80c8ebc..cc26f8f215f5 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -64,7 +64,7 @@ static void writeseg_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { end_page_writeback(bvec->bv_page); - page_cache_release(bvec->bv_page); + put_page(bvec->bv_page); } bio_put(bio); if (atomic_dec_and_test(&super->s_pending_writes)) diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 9c501449450d..b76a62b1978f 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -46,9 +46,9 @@ static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len, BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); - BUG_ON(len > PAGE_CACHE_SIZE); - page_start = ofs & PAGE_CACHE_MASK; - page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + BUG_ON(len > PAGE_SIZE); + page_start = ofs & PAGE_MASK; + page_end = PAGE_ALIGN(ofs + len) - 1; ret = mtd_write(mtd, ofs, len, &retlen, buf); if (ret || (retlen != len)) return -EIO; @@ -82,7 +82,7 @@ static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs, if (!page) continue; memset(page_address(page), 0xFF, PAGE_SIZE); - page_cache_release(page); + put_page(page); } return 0; } @@ -195,7 +195,7 @@ static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, page_address(page)); unlock_page(page); - page_cache_release(page); + put_page(page); if (err) return err; } diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 542468e9bfb4..2d5336bd4efd 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -183,7 +183,7 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) if (name->len != be16_to_cpu(dd->namelen) || memcmp(name->name, dd->name, name->len)) { kunmap_atomic(dd); - page_cache_release(page); + put_page(page); continue; } @@ -238,7 +238,7 @@ static int logfs_unlink(struct inode *dir, struct dentry *dentry) return PTR_ERR(page); } index = page->index; - page_cache_release(page); + put_page(page); mutex_lock(&super->s_dirop_mutex); logfs_add_transaction(dir, ta); @@ -316,7 +316,7 @@ static int logfs_readdir(struct file *file, struct dir_context *ctx) be16_to_cpu(dd->namelen), be64_to_cpu(dd->ino), dd->type); kunmap(page); - page_cache_release(page); + put_page(page); if (full) break; } @@ -349,7 +349,7 @@ static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, dd = kmap_atomic(page); ino = be64_to_cpu(dd->ino); kunmap_atomic(dd); - page_cache_release(page); + put_page(page); inode = logfs_iget(dir->i_sb, ino); if (IS_ERR(inode)) @@ -392,7 +392,7 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry, err = logfs_write_buf(dir, page, WF_LOCK); unlock_page(page); - page_cache_release(page); + put_page(page); if (!err) grow_dir(dir, index); return err; @@ -561,7 +561,7 @@ static int logfs_get_dd(struct inode *dir, struct dentry *dentry, map = kmap_atomic(page); memcpy(dd, map, sizeof(*dd)); kunmap_atomic(map); - page_cache_release(page); + put_page(page); return 0; } @@ -791,7 +791,7 @@ const struct inode_operations logfs_dir_iops = { const struct file_operations logfs_dir_fops = { .fsync = logfs_fsync, .unlocked_ioctl = logfs_ioctl, - .iterate = logfs_readdir, + .iterate_shared = logfs_readdir, .read = generic_read_dir, - .llseek = default_llseek, + .llseek = generic_file_llseek, }; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 61eaeb1b6cac..f01ddfb1a03b 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -15,21 +15,21 @@ static int logfs_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; struct page *page; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; - if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + if ((len == PAGE_SIZE) || PageUptodate(page)) return 0; - if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_CACHE_SIZE - 1); + if ((pos & PAGE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_SIZE - 1); unsigned end = start + len; /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + zero_user_segments(page, 0, start, end, PAGE_SIZE); return 0; } return logfs_readpage_nolock(page); @@ -41,11 +41,11 @@ static int logfs_write_end(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; pgoff_t index = page->index; - unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned start = pos & (PAGE_SIZE - 1); unsigned end = start + copied; int ret = 0; - BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize); + BUG_ON(PAGE_SIZE != inode->i_sb->s_blocksize); BUG_ON(page->index > I3_BLOCKS); if (copied < len) { @@ -61,8 +61,8 @@ static int logfs_write_end(struct file *file, struct address_space *mapping, if (copied == 0) goto out; /* FIXME: do we need to update inode? */ - if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) { - i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end); + if (i_size_read(inode) < (index << PAGE_SHIFT) + end) { + i_size_write(inode, (index << PAGE_SHIFT) + end); mark_inode_dirty_sync(inode); } @@ -75,7 +75,7 @@ static int logfs_write_end(struct file *file, struct address_space *mapping, } out: unlock_page(page); - page_cache_release(page); + put_page(page); return ret ? ret : copied; } @@ -118,7 +118,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + pgoff_t end_index = i_size >> PAGE_SHIFT; unsigned offset; u64 bix; level_t level; @@ -142,7 +142,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc) return __logfs_writepage(page); /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); + offset = i_size & (PAGE_SIZE-1); if (bix > end_index || offset == 0) { unlock_page(page); return 0; /* don't care */ @@ -155,7 +155,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc) * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); return __logfs_writepage(page); } diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 20973c9e52f8..3fb8c6d67303 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -281,7 +281,7 @@ static struct page *logfs_get_read_page(struct inode *inode, u64 bix, static void logfs_put_read_page(struct page *page) { unlock_page(page); - page_cache_release(page); + put_page(page); } static void logfs_lock_write_page(struct page *page) @@ -323,7 +323,7 @@ repeat: return NULL; err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); if (unlikely(err)) { - page_cache_release(page); + put_page(page); if (err == -EEXIST) goto repeat; return NULL; @@ -342,7 +342,7 @@ static void logfs_unlock_write_page(struct page *page) static void logfs_put_write_page(struct page *page) { logfs_unlock_write_page(page); - page_cache_release(page); + put_page(page); } static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level, @@ -562,7 +562,7 @@ static void indirect_free_block(struct super_block *sb, if (PagePrivate(page)) { ClearPagePrivate(page); - page_cache_release(page); + put_page(page); set_page_private(page, 0); } __free_block(sb, block); @@ -655,7 +655,7 @@ static void alloc_data_block(struct inode *inode, struct page *page) block->page = page; SetPagePrivate(page); - page_cache_get(page); + get_page(page); set_page_private(page, (unsigned long) block); block->ops = &indirect_block_ops; @@ -709,7 +709,7 @@ static u64 block_get_pointer(struct page *page, int index) static int logfs_read_empty(struct page *page) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); return 0; } @@ -1660,7 +1660,7 @@ static int truncate_data_block(struct inode *inode, struct page *page, if (err) return err; - zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE); + zero_user_segment(page, size - pageofs, PAGE_SIZE); return logfs_segment_write(inode, page, shadow); } @@ -1919,7 +1919,7 @@ static void move_page_to_inode(struct inode *inode, struct page *page) block->page = NULL; if (PagePrivate(page)) { ClearPagePrivate(page); - page_cache_release(page); + put_page(page); set_page_private(page, 0); } } @@ -1940,7 +1940,7 @@ static void move_inode_to_page(struct page *page, struct inode *inode) if (!PagePrivate(page)) { SetPagePrivate(page); - page_cache_get(page); + get_page(page); set_page_private(page, (unsigned long) block); } @@ -1971,7 +1971,7 @@ int logfs_read_inode(struct inode *inode) logfs_disk_to_inode(di, inode); kunmap_atomic(di); move_page_to_inode(inode, page); - page_cache_release(page); + put_page(page); return 0; } diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index d270e4b2ab6b..1efd6055f4b0 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -90,9 +90,9 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, if (!PagePrivate(page)) { SetPagePrivate(page); - page_cache_get(page); + get_page(page); } - page_cache_release(page); + put_page(page); buf += copylen; len -= copylen; @@ -117,9 +117,9 @@ static void pad_partial_page(struct logfs_area *area) memset(page_address(page) + offset, 0xff, len); if (!PagePrivate(page)) { SetPagePrivate(page); - page_cache_get(page); + get_page(page); } - page_cache_release(page); + put_page(page); } } @@ -129,20 +129,20 @@ static void pad_full_pages(struct logfs_area *area) struct logfs_super *super = logfs_super(sb); u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); u32 len = super->s_segsize - area->a_used_bytes; - pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT; - pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT; + pgoff_t index = PAGE_ALIGN(ofs) >> PAGE_SHIFT; + pgoff_t no_indizes = len >> PAGE_SHIFT; struct page *page; while (no_indizes) { page = get_mapping_page(sb, index, 0); BUG_ON(!page); /* FIXME: reserve a pool */ SetPageUptodate(page); - memset(page_address(page), 0xff, PAGE_CACHE_SIZE); + memset(page_address(page), 0xff, PAGE_SIZE); if (!PagePrivate(page)) { SetPagePrivate(page); - page_cache_get(page); + get_page(page); } - page_cache_release(page); + put_page(page); index++; no_indizes--; } @@ -411,7 +411,7 @@ int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) if (IS_ERR(page)) return PTR_ERR(page); memcpy(buf, page_address(page) + offset, copylen); - page_cache_release(page); + put_page(page); buf += copylen; len -= copylen; @@ -499,7 +499,7 @@ static void move_btree_to_page(struct inode *inode, struct page *page, if (!PagePrivate(page)) { SetPagePrivate(page); - page_cache_get(page); + get_page(page); set_page_private(page, (unsigned long) block); } block->ops = &indirect_block_ops; @@ -554,7 +554,7 @@ void move_page_to_btree(struct page *page) if (PagePrivate(page)) { ClearPagePrivate(page); - page_cache_release(page); + put_page(page); set_page_private(page, 0); } block->ops = &btree_block_ops; @@ -723,9 +723,9 @@ void freeseg(struct super_block *sb, u32 segno) continue; if (PagePrivate(page)) { ClearPagePrivate(page); - page_cache_release(page); + put_page(page); } - page_cache_release(page); + put_page(page); } } diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 54360293bcb5..5751082dba52 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -48,7 +48,7 @@ void emergency_read_end(struct page *page) if (page == emergency_page) mutex_unlock(&emergency_mutex); else - page_cache_release(page); + put_page(page); } static void dump_segfile(struct super_block *sb) @@ -206,7 +206,7 @@ static int write_one_sb(struct super_block *sb, logfs_set_segment_erased(sb, segno, ec, 0); logfs_write_ds(sb, ds, segno, ec); err = super->s_devops->write_sb(sb, page); - page_cache_release(page); + put_page(page); return err; } @@ -366,24 +366,24 @@ static struct page *find_super_block(struct super_block *sb) return NULL; last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); if (!last || IS_ERR(last)) { - page_cache_release(first); + put_page(first); return NULL; } if (!logfs_check_ds(page_address(first))) { - page_cache_release(last); + put_page(last); return first; } /* First one didn't work, try the second superblock */ if (!logfs_check_ds(page_address(last))) { - page_cache_release(first); + put_page(first); return last; } /* Neither worked, sorry folks */ - page_cache_release(first); - page_cache_release(last); + put_page(first); + put_page(last); return NULL; } @@ -425,7 +425,7 @@ static int __logfs_read_sb(struct super_block *sb) super->s_data_levels = ds->ds_data_levels; super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels + super->s_data_levels; - page_cache_release(page); + put_page(page); return 0; } diff --git a/fs/minix/dir.c b/fs/minix/dir.c index d19ac258105a..31dcd515b9d5 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -21,14 +21,14 @@ static int minix_readdir(struct file *, struct dir_context *); const struct file_operations minix_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = minix_readdir, + .iterate_shared = minix_readdir, .fsync = generic_file_fsync, }; static inline void dir_put_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } /* @@ -38,10 +38,10 @@ static inline void dir_put_page(struct page *page) static unsigned minix_last_byte(struct inode *inode, unsigned long page_nr) { - unsigned last_byte = PAGE_CACHE_SIZE; + unsigned last_byte = PAGE_SIZE; - if (page_nr == (inode->i_size >> PAGE_CACHE_SHIFT)) - last_byte = inode->i_size & (PAGE_CACHE_SIZE - 1); + if (page_nr == (inode->i_size >> PAGE_SHIFT)) + last_byte = inode->i_size & (PAGE_SIZE - 1); return last_byte; } @@ -92,8 +92,8 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) if (pos >= inode->i_size) return 0; - offset = pos & ~PAGE_CACHE_MASK; - n = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_MASK; + n = pos >> PAGE_SHIFT; for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; @@ -229,7 +229,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) lock_page(page); kaddr = (char*)page_address(page); dir_end = kaddr + minix_last_byte(dir, n); - limit = kaddr + PAGE_CACHE_SIZE - sbi->s_dirsize; + limit = kaddr + PAGE_SIZE - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { de = (minix_dirent *)p; de3 = (minix3_dirent *)p; @@ -327,7 +327,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir) } kaddr = kmap_atomic(page); - memset(kaddr, 0, PAGE_CACHE_SIZE); + memset(kaddr, 0, PAGE_SIZE); if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)kaddr; @@ -350,7 +350,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir) err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); fail: - page_cache_release(page); + put_page(page); return err; } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index a795a11e50c7..2887d1d95ce2 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -243,11 +243,11 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, out_dir: if (dir_de) { kunmap(dir_page); - page_cache_release(dir_page); + put_page(dir_page); } out_old: kunmap(old_page); - page_cache_release(old_page); + put_page(old_page); out: return err; } diff --git a/fs/mpage.c b/fs/mpage.c index 6bd9fd90964e..eedc644b78d7 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -107,7 +107,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) * don't make any buffers if there is only one buffer on * the page and the page just needs to be set up to date */ - if (inode->i_blkbits == PAGE_CACHE_SHIFT && + if (inode->i_blkbits == PAGE_SHIFT && buffer_uptodate(bh)) { SetPageUptodate(page); return; @@ -145,7 +145,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, { struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t block_in_file; sector_t last_block; @@ -162,7 +162,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, if (page_has_buffers(page)) goto confused; - block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) @@ -249,7 +249,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, } if (first_hole != blocks_per_page) { - zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); + zero_user_segment(page, first_hole << blkbits, PAGE_SIZE); if (first_hole == 0) { SetPageUptodate(page); unlock_page(page); @@ -331,7 +331,7 @@ confused: * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: - * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. + * the end-of-file on blocksize < PAGE_SIZE setups. * * BH_Boundary explanation: * @@ -380,7 +380,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, &first_logical_block, get_block, gfp); } - page_cache_release(page); + put_page(page); } BUG_ON(!list_empty(pages)); if (bio) @@ -472,7 +472,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; unsigned long end_index; - const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocks_per_page = PAGE_SIZE >> blkbits; sector_t last_block; sector_t block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; @@ -542,7 +542,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * The page has no buffers: map it to disk */ BUG_ON(!PageUptodate(page)); - block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); last_block = (i_size - 1) >> blkbits; map_bh.b_page = page; for (page_block = 0; page_block < blocks_per_page; ) { @@ -574,7 +574,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, first_unmapped = page_block; page_is_mapped: - end_index = i_size >> PAGE_CACHE_SHIFT; + end_index = i_size >> PAGE_SHIFT; if (page->index >= end_index) { /* * The page straddles i_size. It must be zeroed out on each @@ -584,11 +584,11 @@ page_is_mapped: * is zeroed when mapped, and writes to that region are not * written out to the file." */ - unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); + unsigned offset = i_size & (PAGE_SIZE - 1); if (page->index > end_index || !offset) goto confused; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); } /* diff --git a/fs/namei.c b/fs/namei.c index 794f81dce766..15b124c18ed8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -265,7 +265,7 @@ static int check_acl(struct inode *inode, int mask) if (!acl) return -EAGAIN; /* no ->get_acl() calls in RCU mode... */ - if (acl == ACL_NOT_CACHED) + if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } @@ -1603,32 +1603,42 @@ static struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { - struct dentry *dentry; - inode_lock(dir->d_inode); - dentry = d_lookup(dir, name); - if (unlikely(dentry)) { + struct dentry *dentry = ERR_PTR(-ENOENT), *old; + struct inode *inode = dir->d_inode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + + inode_lock_shared(inode); + /* Don't go there if it's already dead */ + if (unlikely(IS_DEADDIR(inode))) + goto out; +again: + dentry = d_alloc_parallel(dir, name, &wq); + if (IS_ERR(dentry)) + goto out; + if (unlikely(!d_in_lookup(dentry))) { if ((dentry->d_flags & DCACHE_OP_REVALIDATE) && !(flags & LOOKUP_NO_REVAL)) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { - if (!error) + if (!error) { d_invalidate(dentry); + dput(dentry); + goto again; + } dput(dentry); dentry = ERR_PTR(error); } } - if (dentry) { - inode_unlock(dir->d_inode); - return dentry; + } else { + old = inode->i_op->lookup(inode, dentry, flags); + d_lookup_done(dentry); + if (unlikely(old)) { + dput(dentry); + dentry = old; } } - dentry = d_alloc(dir, name); - if (unlikely(!dentry)) { - inode_unlock(dir->d_inode); - return ERR_PTR(-ENOMEM); - } - dentry = lookup_real(dir->d_inode, dentry, flags); - inode_unlock(dir->d_inode); +out: + inode_unlock_shared(inode); return dentry; } @@ -1740,15 +1750,17 @@ static int walk_component(struct nameidata *nd, int flags) nd->flags); if (IS_ERR(path.dentry)) return PTR_ERR(path.dentry); - if (unlikely(d_is_negative(path.dentry))) { - dput(path.dentry); - return -ENOENT; - } + path.mnt = nd->path.mnt; err = follow_managed(&path, nd); if (unlikely(err < 0)) return err; + if (unlikely(d_is_negative(path.dentry))) { + path_to_nameidata(&path, nd); + return -ENOENT; + } + seq = 0; /* we are already out of RCU mode */ inode = d_backing_inode(path.dentry); } @@ -1792,30 +1804,49 @@ static inline unsigned int fold_hash(unsigned long hash) return hash_64(hash, 32); } +/* + * This is George Marsaglia's XORSHIFT generator. + * It implements a maximum-period LFSR in only a few + * instructions. It also has the property (required + * by hash_name()) that mix_hash(0) = 0. + */ +static inline unsigned long mix_hash(unsigned long hash) +{ + hash ^= hash << 13; + hash ^= hash >> 7; + hash ^= hash << 17; + return hash; +} + #else /* 32-bit case */ #define fold_hash(x) (x) +static inline unsigned long mix_hash(unsigned long hash) +{ + hash ^= hash << 13; + hash ^= hash >> 17; + hash ^= hash << 5; + return hash; +} + #endif unsigned int full_name_hash(const unsigned char *name, unsigned int len) { - unsigned long a, mask; - unsigned long hash = 0; + unsigned long a, hash = 0; for (;;) { a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; - hash += a; - hash *= 9; + hash = mix_hash(hash + a); name += sizeof(unsigned long); len -= sizeof(unsigned long); if (!len) goto done; } - mask = bytemask_from_count(len); - hash += mask & a; + hash += a & bytemask_from_count(len); done: return fold_hash(hash); } @@ -1833,7 +1864,7 @@ static inline u64 hash_name(const char *name) hash = a = 0; len = -sizeof(unsigned long); do { - hash = (hash + a) * 9; + hash = mix_hash(hash + a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); @@ -2265,6 +2296,33 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, EXPORT_SYMBOL(vfs_path_lookup); /** + * lookup_hash - lookup single pathname component on already hashed name + * @name: name and hash to lookup + * @base: base directory to lookup from + * + * The name must have been verified and hashed (see lookup_one_len()). Using + * this after just full_name_hash() is unsafe. + * + * This function also doesn't check for search permission on base directory. + * + * Use lookup_one_len_unlocked() instead, unless you really know what you are + * doing. + * + * Do not hold i_mutex; this helper takes i_mutex if necessary. + */ +struct dentry *lookup_hash(const struct qstr *name, struct dentry *base) +{ + struct dentry *ret; + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow(name, base, 0); + + return ret; +} +EXPORT_SYMBOL(lookup_hash); + +/** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from @@ -2335,7 +2393,6 @@ struct dentry *lookup_one_len_unlocked(const char *name, struct qstr this; unsigned int c; int err; - struct dentry *ret; this.name = name; this.len = len; @@ -2367,10 +2424,7 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (err) return ERR_PTR(err); - ret = lookup_dcache(&this, base, 0); - if (!ret) - ret = lookup_slow(&this, base, 0); - return ret; + return lookup_hash(&this, base); } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2653,7 +2707,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) return NULL; } - mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + mutex_lock(&p1->d_sb->s_vfs_rename_mutex); p = d_ancestor(p2, p1); if (p) { @@ -2680,7 +2734,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); - mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } EXPORT_SYMBOL(unlock_rename); @@ -2783,7 +2837,7 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) +static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) @@ -2812,155 +2866,56 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) static int atomic_open(struct nameidata *nd, struct dentry *dentry, struct path *path, struct file *file, const struct open_flags *op, - bool got_write, bool need_lookup, + int open_flag, umode_t mode, int *opened) { + struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; - unsigned open_flag = open_to_namei_flags(op->open_flag); - umode_t mode; int error; - int acc_mode; - int create_error = 0; - struct dentry *const DENTRY_NOT_SET = (void *) -1UL; - bool excl; - - BUG_ON(dentry->d_inode); - - /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(dir))) { - error = -ENOENT; - goto out; - } - - mode = op->mode; - if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) - mode &= ~current_umask(); - excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT); - if (excl) + if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */ open_flag &= ~O_TRUNC; - /* - * Checking write permission is tricky, bacuse we don't know if we are - * going to actually need it: O_CREAT opens should work as long as the - * file exists. But checking existence breaks atomicity. The trick is - * to check access and if not granted clear O_CREAT from the flags. - * - * Another problem is returing the "right" error value (e.g. for an - * O_EXCL open we want to return EEXIST not EROFS). - */ - if (((open_flag & (O_CREAT | O_TRUNC)) || - (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) { - if (!(open_flag & O_CREAT)) { - /* - * No O_CREATE -> atomicity not a requirement -> fall - * back to lookup + open - */ - goto no_open; - } else if (open_flag & (O_EXCL | O_TRUNC)) { - /* Fall back and fail with the right error */ - create_error = -EROFS; - goto no_open; - } else { - /* No side effects, safe to clear O_CREAT */ - create_error = -EROFS; - open_flag &= ~O_CREAT; - } - } - - if (open_flag & O_CREAT) { - error = may_o_create(&nd->path, dentry, mode); - if (error) { - create_error = error; - if (open_flag & O_EXCL) - goto no_open; - open_flag &= ~O_CREAT; - } - } - if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; - error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode, - opened); - if (error < 0) { - if (create_error && error == -ENOENT) - error = create_error; - goto out; - } - - if (error) { /* returned 1, that is */ + error = dir->i_op->atomic_open(dir, dentry, file, + open_to_namei_flags(open_flag), + mode, opened); + d_lookup_done(dentry); + if (!error) { + /* + * We didn't have the inode before the open, so check open + * permission here. + */ + int acc_mode = op->acc_mode; + if (*opened & FILE_CREATED) { + WARN_ON(!(open_flag & O_CREAT)); + fsnotify_create(dir, dentry); + acc_mode = 0; + } + error = may_open(&file->f_path, acc_mode, open_flag); + if (WARN_ON(error > 0)) + error = -EINVAL; + } else if (error > 0) { if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; - goto out; - } - if (file->f_path.dentry) { - dput(dentry); - dentry = file->f_path.dentry; - } - if (*opened & FILE_CREATED) - fsnotify_create(dir, dentry); - if (!dentry->d_inode) { - WARN_ON(*opened & FILE_CREATED); - if (create_error) { - error = create_error; - goto out; - } } else { - if (excl && !(*opened & FILE_CREATED)) { - error = -EEXIST; - goto out; + if (file->f_path.dentry) { + dput(dentry); + dentry = file->f_path.dentry; } + if (*opened & FILE_CREATED) + fsnotify_create(dir, dentry); + path->dentry = dentry; + path->mnt = nd->path.mnt; + return 1; } - goto looked_up; - } - - /* - * We didn't have the inode before the open, so check open permission - * here. - */ - acc_mode = op->acc_mode; - if (*opened & FILE_CREATED) { - WARN_ON(!(open_flag & O_CREAT)); - fsnotify_create(dir, dentry); - acc_mode = 0; } - error = may_open(&file->f_path, acc_mode, open_flag); - if (error) - fput(file); - -out: dput(dentry); return error; - -no_open: - if (need_lookup) { - dentry = lookup_real(dir, dentry, nd->flags); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - if (create_error) { - int open_flag = op->open_flag; - - error = create_error; - if ((open_flag & O_EXCL)) { - if (!dentry->d_inode) - goto out; - } else if (!dentry->d_inode) { - goto out; - } else if ((open_flag & O_TRUNC) && - d_is_reg(dentry)) { - goto out; - } - /* will fail later, go on to get the right error */ - } - } -looked_up: - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 1; } /* @@ -2988,62 +2943,118 @@ static int lookup_open(struct nameidata *nd, struct path *path, { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; + int open_flag = op->open_flag; struct dentry *dentry; - int error; - bool need_lookup = false; + int error, create_error = 0; + umode_t mode = op->mode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + + if (unlikely(IS_DEADDIR(dir_inode))) + return -ENOENT; *opened &= ~FILE_CREATED; - dentry = lookup_dcache(&nd->last, dir, nd->flags); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + dentry = d_lookup(dir, &nd->last); + for (;;) { + if (!dentry) { + dentry = d_alloc_parallel(dir, &nd->last, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (d_in_lookup(dentry)) + break; - if (!dentry) { - dentry = d_alloc(dir, &nd->last); - if (unlikely(!dentry)) - return -ENOMEM; - need_lookup = true; - } else if (dentry->d_inode) { + if (!(dentry->d_flags & DCACHE_OP_REVALIDATE)) + break; + + error = d_revalidate(dentry, nd->flags); + if (likely(error > 0)) + break; + if (error) + goto out_dput; + d_invalidate(dentry); + dput(dentry); + dentry = NULL; + } + if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ goto out_no_open; } - if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { - return atomic_open(nd, dentry, path, file, op, got_write, - need_lookup, opened); + /* + * Checking write permission is tricky, bacuse we don't know if we are + * going to actually need it: O_CREAT opens should work as long as the + * file exists. But checking existence breaks atomicity. The trick is + * to check access and if not granted clear O_CREAT from the flags. + * + * Another problem is returing the "right" error value (e.g. for an + * O_EXCL open we want to return EEXIST not EROFS). + */ + if (open_flag & O_CREAT) { + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); + if (unlikely(!got_write)) { + create_error = -EROFS; + open_flag &= ~O_CREAT; + if (open_flag & (O_EXCL | O_TRUNC)) + goto no_open; + /* No side effects, safe to clear O_CREAT */ + } else { + create_error = may_o_create(&nd->path, dentry, mode); + if (create_error) { + open_flag &= ~O_CREAT; + if (open_flag & O_EXCL) + goto no_open; + } + } + } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) && + unlikely(!got_write)) { + /* + * No O_CREATE -> atomicity not a requirement -> fall + * back to lookup + open + */ + goto no_open; } - if (need_lookup) { - BUG_ON(dentry->d_inode); + if (dir_inode->i_op->atomic_open) { + error = atomic_open(nd, dentry, path, file, op, open_flag, + mode, opened); + if (unlikely(error == -ENOENT) && create_error) + error = create_error; + return error; + } - dentry = lookup_real(dir_inode, dentry, nd->flags); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); +no_open: + if (d_in_lookup(dentry)) { + struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, + nd->flags); + d_lookup_done(dentry); + if (unlikely(res)) { + if (IS_ERR(res)) { + error = PTR_ERR(res); + goto out_dput; + } + dput(dentry); + dentry = res; + } } /* Negative dentry, just create the file */ - if (!dentry->d_inode && (op->open_flag & O_CREAT)) { - umode_t mode = op->mode; - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); - /* - * This write is needed to ensure that a - * rw->ro transition does not occur between - * the time when the file is created and when - * a permanent write count is taken through - * the 'struct file' in finish_open(). - */ - if (!got_write) { - error = -EROFS; - goto out_dput; - } + if (!dentry->d_inode && (open_flag & O_CREAT)) { *opened |= FILE_CREATED; - error = security_path_mknod(&nd->path, dentry, mode, 0); - if (error) + audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); + if (!dir_inode->i_op->create) { + error = -EACCES; goto out_dput; - error = vfs_create(dir->d_inode, dentry, mode, - nd->flags & LOOKUP_EXCL); + } + error = dir_inode->i_op->create(dir_inode, dentry, mode, + open_flag & O_EXCL); if (error) goto out_dput; + fsnotify_create(dir_inode, dentry); + } + if (unlikely(create_error) && !dentry->d_inode) { + error = create_error; + goto out_dput; } out_no_open: path->dentry = dentry; @@ -3115,7 +3126,7 @@ static int do_last(struct nameidata *nd, } retry_lookup: - if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { + if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { error = mnt_want_write(nd->path.mnt); if (!error) got_write = true; @@ -3125,9 +3136,15 @@ retry_lookup: * dropping this one anyway. */ } - inode_lock(dir->d_inode); + if (open_flag & O_CREAT) + inode_lock(dir->d_inode); + else + inode_lock_shared(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write, opened); - inode_unlock(dir->d_inode); + if (open_flag & O_CREAT) + inode_unlock(dir->d_inode); + else + inode_unlock_shared(dir->d_inode); if (error <= 0) { if (error) @@ -3207,10 +3224,6 @@ finish_open: return error; } audit_inode(nd->name, nd->path.dentry, 0); - if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) { - error = -ELOOP; - goto out; - } error = -EISDIR; if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; @@ -3227,11 +3240,9 @@ finish_open: got_write = true; } finish_open_created: - if (likely(!(open_flag & O_PATH))) { - error = may_open(&nd->path, acc_mode, open_flag); - if (error) - goto out; - } + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto out; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file, current_cred()); if (!error) { @@ -3243,18 +3254,13 @@ finish_open_created: } opened: error = open_check_o_direct(file); - if (error) - goto exit_fput; - error = ima_file_check(file, op->acc_mode, *opened); - if (error) - goto exit_fput; - - if (will_truncate) { + if (!error) + error = ima_file_check(file, op->acc_mode, *opened); + if (!error && will_truncate) error = handle_truncate(file); - if (error) - goto exit_fput; - } out: + if (unlikely(error) && (*opened & FILE_OPENED)) + fput(file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; @@ -3264,10 +3270,6 @@ out: path_put(&save_parent); return error; -exit_fput: - fput(file); - goto out; - stale_open: /* If no saved parent or already retried then can't retry */ if (!save_parent.dentry || retried) @@ -3345,6 +3347,18 @@ out: return error; } +static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) +{ + struct path path; + int error = path_lookupat(nd, flags, &path); + if (!error) { + audit_inode(nd->name, path.dentry, 0); + error = vfs_open(&path, file, current_cred()); + path_put(&path); + } + return error; +} + static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { @@ -3364,6 +3378,13 @@ static struct file *path_openat(struct nameidata *nd, goto out2; } + if (unlikely(file->f_flags & O_PATH)) { + error = do_o_path(nd, flags, file); + if (!error) + opened |= FILE_OPENED; + goto out2; + } + s = path_init(nd, flags); if (IS_ERR(s)) { put_filp(file); @@ -3606,6 +3627,8 @@ retry: switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(path.dentry->d_inode,dentry,mode,true); + if (!error) + ima_post_path_mknod(dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(path.dentry->d_inode,dentry,mode, @@ -4211,7 +4234,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; - if (source == target) + /* + * Check source == target. + * On overlayfs need to look at underlying inodes. + */ + if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0)) return 0; error = may_delete(old_dir, old_dentry, is_dir); @@ -4515,7 +4542,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link) out: return len; } -EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index b7f8eaeea5d8..bfdad003ee56 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -510,7 +510,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx) kunmap(ctl.page); SetPageUptodate(ctl.page); unlock_page(ctl.page); - page_cache_release(ctl.page); + put_page(ctl.page); ctl.page = NULL; } ctl.idx = 0; @@ -520,7 +520,7 @@ invalid_cache: if (ctl.page) { kunmap(ctl.page); unlock_page(ctl.page); - page_cache_release(ctl.page); + put_page(ctl.page); ctl.page = NULL; } ctl.cache = cache; @@ -554,14 +554,14 @@ finished: kunmap(ctl.page); SetPageUptodate(ctl.page); unlock_page(ctl.page); - page_cache_release(ctl.page); + put_page(ctl.page); } if (page) { cache->head = ctl.head; kunmap(page); SetPageUptodate(page); unlock_page(page); - page_cache_release(page); + put_page(page); } out: return result; @@ -649,7 +649,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, kunmap(ctl.page); SetPageUptodate(ctl.page); unlock_page(ctl.page); - page_cache_release(ctl.page); + put_page(ctl.page); } ctl.cache = NULL; ctl.idx -= NCP_DIRCACHE_SIZE; diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 5233fbc1747a..17cfb743b5bf 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -191,7 +191,7 @@ struct ncp_cache_head { int eof; }; -#define NCP_DIRCACHE_SIZE ((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *))) +#define NCP_DIRCACHE_SIZE ((int)(PAGE_SIZE/sizeof(struct dentry *))) union ncp_dir_cache { struct ncp_cache_head head; struct dentry *dentry[NCP_DIRCACHE_SIZE]; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 02e4d87d2ed3..17a42e4eb872 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -231,7 +231,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) size_t bytes_left = header->args.count; unsigned int pg_offset = header->args.pgbase, pg_len; struct page **pages = header->args.pages; - int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; + int pg_index = header->args.pgbase >> PAGE_SHIFT; const bool is_dio = (header->dreq != NULL); struct blk_plug plug; int i; @@ -263,13 +263,13 @@ bl_read_pagelist(struct nfs_pgio_header *header) } if (is_dio) { - if (pg_offset + bytes_left > PAGE_CACHE_SIZE) - pg_len = PAGE_CACHE_SIZE - pg_offset; + if (pg_offset + bytes_left > PAGE_SIZE) + pg_len = PAGE_SIZE - pg_offset; else pg_len = bytes_left; } else { BUG_ON(pg_offset != 0); - pg_len = PAGE_CACHE_SIZE; + pg_len = PAGE_SIZE; } if (is_hole(&be)) { @@ -339,9 +339,9 @@ static void bl_write_cleanup(struct work_struct *work) if (likely(!hdr->pnfs_error)) { struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg); - u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK; + u64 start = hdr->args.offset & (loff_t)PAGE_MASK; u64 end = (hdr->args.offset + hdr->args.count + - PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK; + PAGE_SIZE - 1) & (loff_t)PAGE_MASK; ext_tree_mark_written(bl, start >> SECTOR_SHIFT, (end - start) >> SECTOR_SHIFT); @@ -373,7 +373,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) loff_t offset = header->args.offset; size_t count = header->args.count; struct page **pages = header->args.pages; - int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; + int pg_index = header->args.pgbase >> PAGE_SHIFT; unsigned int pg_len; struct blk_plug plug; int i; @@ -392,7 +392,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) blk_start_plug(&plug); /* we always write out the whole page */ - offset = offset & (loff_t)PAGE_CACHE_MASK; + offset = offset & (loff_t)PAGE_MASK; isect = offset >> SECTOR_SHIFT; for (i = pg_index; i < header->page_array.npages; i++) { @@ -408,7 +408,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) extent_length = be.be_length - (isect - be.be_f_offset); } - pg_len = PAGE_CACHE_SIZE; + pg_len = PAGE_SIZE; bio = do_add_page_to_bio(bio, header->page_array.npages - i, WRITE, isect, pages[i], &map, &be, bl_end_io_write, par, @@ -820,7 +820,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) pgoff_t end; /* Optimize common case that writes from 0 to end of file */ - end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); + end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (end != inode->i_mapping->nrpages) { rcu_read_lock(); end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); @@ -828,9 +828,9 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) } if (!end) - return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT); + return i_size_read(inode) - (idx << PAGE_SHIFT); else - return (end - idx) << PAGE_CACHE_SHIFT; + return (end - idx) << PAGE_SHIFT; } static void diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index bc21205309e0..18e6fd0b9506 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -40,8 +40,8 @@ #include "../pnfs.h" #include "../netns.h" -#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) -#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) +#define PAGE_CACHE_SECTORS (PAGE_SIZE >> SECTOR_SHIFT) +#define PAGE_CACHE_SECTOR_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTOR_SIZE (1 << SECTOR_SHIFT) struct pnfs_block_dev; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 618ced381a14..aaa2e8d3df6f 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -217,7 +217,8 @@ static u32 initiate_file_draining(struct nfs_client *clp, } if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, - &args->cbl_range)) { + &args->cbl_range, + be32_to_cpu(args->cbl_stateid.seqid))) { rv = NFS4_OK; goto unlock; } @@ -500,8 +501,10 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, cps->slot = slot; /* The ca_maxresponsesize_cached is 0 with no DRC */ - if (args->csa_cachethis != 0) - return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + if (args->csa_cachethis != 0) { + status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + goto out_unlock; + } /* * Check for pending referring calls. If a match is found, a diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 976c90608e56..d81f96aacd51 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) p = read_buf(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(stateid, p, NFS4_STATEID_SIZE); + memcpy(stateid->data, p, NFS4_STATEID_SIZE); return 0; } +static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) { __be32 *p; @@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, __be32 *p; __be32 status; - status = decode_stateid(xdr, &args->stateid); + status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) goto out; p = read_buf(xdr, 4); @@ -227,6 +233,11 @@ out: } #if defined(CONFIG_NFS_V4_1) +static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, @@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, } p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); - status = decode_stateid(xdr, &args->cbl_stateid); + status = decode_layout_stateid(xdr, &args->cbl_stateid); if (unlikely(status != 0)) goto out; } else if (args->cbl_recall_type == RETURN_FSID) { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d6d5d2a48e83..0c96528db94a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -736,7 +736,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->rsize = max_rpc_payload; if (server->rsize > NFS_MAX_FILE_IO_SIZE) server->rsize = NFS_MAX_FILE_IO_SIZE; - server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; @@ -745,13 +745,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; - server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); - if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES) - server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES) + server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES; if (server->dtsize > server->rsize) server->dtsize = server->rsize; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5166adcfc0fb..322c2585bc34 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -875,15 +875,16 @@ int nfs_delegations_present(struct nfs_client *clp) /** * nfs4_copy_delegation_stateid - Copy inode's state ID information - * @dst: stateid data structure to fill in * @inode: inode to check * @flags: delegation type requirement + * @dst: stateid data structure to fill in + * @cred: optional argument to retrieve credential * * Returns "true" and fills in "dst->data" * if inode had a delegation, * otherwise "false" is returned. */ -bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, - fmode_t flags) +bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, + nfs4_stateid *dst, struct rpc_cred **cred) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; @@ -896,6 +897,8 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); + if (cred) + *cred = get_rpccred(delegation->cred); } rcu_read_unlock(); return ret; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 333063e032f0..64724d252a79 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -56,7 +56,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); -bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); +bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4bfa7d8bcade..aaf7bd0cbae2 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .iterate = nfs_readdir, + .iterate_shared = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, @@ -145,6 +145,7 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { + atomic_t refcount; int size; int eof_index; u64 last_cookie; @@ -200,11 +201,20 @@ void nfs_readdir_clear_array(struct page *page) int i; array = kmap_atomic(page); - for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); + if (atomic_dec_and_test(&array->refcount)) + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); kunmap_atomic(array); } +static bool grab_page(struct page *page) +{ + struct nfs_cache_array *array = kmap_atomic(page); + bool res = atomic_inc_not_zero(&array->refcount); + kunmap_atomic(array); + return res; +} + /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in @@ -377,7 +387,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); - error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, + error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ @@ -470,6 +480,7 @@ static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) { struct qstr filename = QSTR_INIT(entry->name, entry->len); + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; struct inode *dir = d_inode(parent); @@ -489,7 +500,13 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) filename.hash = full_name_hash(filename.name, filename.len); dentry = d_lookup(parent, &filename); - if (dentry != NULL) { +again: + if (!dentry) { + dentry = d_alloc_parallel(parent, &filename, &wq); + if (IS_ERR(dentry)) + return; + } + if (!d_in_lookup(dentry)) { /* Is there a mountpoint here? If so, just exit */ if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid, &entry->fattr->fsid)) @@ -503,26 +520,21 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) } else { d_invalidate(dentry); dput(dentry); + dentry = NULL; + goto again; } } - dentry = d_alloc(parent, &filename); - if (dentry == NULL) - return; - inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); - if (IS_ERR(inode)) - goto out; - alias = d_splice_alias(inode, dentry); - if (IS_ERR(alias)) - goto out; - else if (alias) { - nfs_set_verifier(alias, nfs_save_change_attribute(dir)); - dput(alias); - } else - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - + d_lookup_done(dentry); + if (alias) { + if (IS_ERR(alias)) + goto out; + dput(dentry); + dentry = alias; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out: dput(dentry); } @@ -560,7 +572,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en count++; if (desc->plus != 0) - nfs_prime_dcache(desc->file->f_path.dentry, entry); + nfs_prime_dcache(file_dentry(desc->file), entry); status = nfs_readdir_add_to_array(entry, page); if (status != 0) @@ -643,6 +655,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out_label_free; } memset(array, 0, sizeof(struct nfs_cache_array)); + atomic_set(&array->refcount, 1); array->eof_index = -1; status = nfs_readdir_alloc_pages(pages, array_size); @@ -705,17 +718,24 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) static void cache_page_release(nfs_readdir_descriptor_t *desc) { - if (!desc->page->mapping) - nfs_readdir_clear_array(desc->page); - page_cache_release(desc->page); + nfs_readdir_clear_array(desc->page); + put_page(desc->page); desc->page = NULL; } static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - return read_cache_page(file_inode(desc->file)->i_mapping, + struct page *page; + + for (;;) { + page = read_cache_page(file_inode(desc->file)->i_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); + if (IS_ERR(page) || grab_page(page)) + break; + put_page(page); + } + return page; } /* @@ -864,7 +884,7 @@ static bool nfs_dir_mapping_need_revalidate(struct inode *dir) */ static int nfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); nfs_readdir_descriptor_t my_desc, *desc = &my_desc; @@ -889,7 +909,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; - nfs_block_sillyrename(dentry); if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) @@ -925,7 +944,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } while (!desc->eof); out: - nfs_unblock_sillyrename(dentry); if (res > 0) res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); @@ -934,13 +952,11 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { - struct inode *inode = file_inode(filp); struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); - inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -948,16 +964,13 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset >= 0) break; default: - offset = -EINVAL; - goto out; + return -EINVAL; } if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } -out: - inode_unlock(inode); return offset; } @@ -1383,7 +1396,6 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ trace_nfs_lookup_enter(dir, dentry, flags); - nfs_block_sillyrename(parent); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error == -ENOENT) goto no_entry; @@ -1408,7 +1420,6 @@ no_entry: } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unblock_sillyrename: - nfs_unblock_sillyrename(parent); trace_nfs_lookup_exit(dir, dentry, flags, error); nfs4_label_free(label); out: @@ -1520,9 +1531,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, goto out; trace_nfs_atomic_open_enter(dir, ctx, open_flags); - nfs_block_sillyrename(dentry->d_parent); inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened); - nfs_unblock_sillyrename(dentry->d_parent); if (IS_ERR(inode)) { err = PTR_ERR(inode); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); @@ -1766,7 +1775,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) trace_nfs_rmdir_enter(dir, dentry); if (d_really_is_positive(dentry)) { - nfs_wait_on_sillyrename(dentry); + down_write(&NFS_I(d_inode(dentry))->rmdir_sem); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ switch (error) { @@ -1776,6 +1785,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) case -ENOENT: nfs_dentry_handle_enoent(dentry); } + up_write(&NFS_I(d_inode(dentry))->rmdir_sem); } else error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); trace_nfs_rmdir_exit(dir, dentry, error); @@ -1923,7 +1933,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * add_to_page_cache_lru() grabs an extra page refcount. * Drop it here to avoid leaking this page later. */ - page_cache_release(page); + put_page(page); } else __free_page(page); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7a0cfd3266e5..979b3c4dee6a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -87,6 +87,7 @@ struct nfs_direct_req { int mirror_count; ssize_t count, /* bytes actually processed */ + max_count, /* max expected count */ bytes_left, /* bytes left to be sent */ io_start, /* start of IO */ error; /* any reported error */ @@ -123,6 +124,8 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) int i; ssize_t count; + WARN_ON_ONCE(dreq->count >= dreq->max_count); + if (dreq->mirror_count == 1) { dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; dreq->count += hdr->good_bytes; @@ -250,7 +253,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, * shunt off direct read and write requests before the VFS gets them, * so this method is only ever called for swap. */ -ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = iocb->ki_filp->f_mapping->host; @@ -261,7 +264,7 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter, pos); + return nfs_file_direct_read(iocb, iter); return nfs_file_direct_write(iocb, iter); } @@ -269,13 +272,13 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) { unsigned int i; for (i = 0; i < npages; i++) - page_cache_release(pages[i]); + put_page(pages[i]); } void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq) { - cinfo->lock = &dreq->inode->i_lock; + cinfo->inode = dreq->inode; cinfo->mds = &dreq->mds_cinfo; cinfo->ds = &dreq->ds_cinfo; cinfo->dreq = dreq; @@ -396,7 +399,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) static void nfs_direct_readpage_release(struct nfs_page *req) { dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", - d_inode(req->wb_context->dentry)->i_sb->s_id, + req->wb_context->dentry->d_sb->s_id, (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), req->wb_bytes, (long long)req_offset(req)); @@ -545,7 +548,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data - * @pos: byte offset in file where reading starts * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -561,8 +563,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -574,7 +575,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", - file, count, (long long) pos); + file, count, (long long) iocb->ki_pos); result = 0; if (!count) @@ -593,8 +594,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock; dreq->inode = inode; - dreq->bytes_left = count; - dreq->io_start = pos; + dreq->bytes_left = dreq->max_count = count; + dreq->io_start = iocb->ki_pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { @@ -606,14 +607,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, dreq->iocb = iocb; NFS_I(inode)->read_io += count; - result = nfs_direct_read_schedule_iovec(dreq, iter, pos); + result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); if (result > 0) - iocb->ki_pos = pos + result; + iocb->ki_pos += result; } nfs_direct_req_release(dreq); @@ -632,13 +633,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); #ifdef CONFIG_NFS_V4_1 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); #endif nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) @@ -673,13 +674,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) if (!nfs_pageio_add_request(&desc, req)) { nfs_list_remove_request(req); nfs_list_add_request(req, &failed); - spin_lock(cinfo.lock); + spin_lock(&cinfo.inode->i_lock); dreq->flags = 0; if (desc.pg_error < 0) dreq->error = desc.pg_error; else dreq->error = -EIO; - spin_unlock(cinfo.lock); + spin_unlock(&cinfo.inode->i_lock); } nfs_release_request(req); } @@ -969,7 +970,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data - * @pos: byte offset in file where writing starts * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -1003,7 +1003,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) iov_iter_count(iter)); pos = iocb->ki_pos; - end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT; + end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; inode_lock(inode); @@ -1013,7 +1013,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (mapping->nrpages) { result = invalidate_inode_pages2_range(mapping, - pos >> PAGE_CACHE_SHIFT, end); + pos >> PAGE_SHIFT, end); if (result) goto out_unlock; } @@ -1026,7 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; dreq->inode = inode; - dreq->bytes_left = iov_iter_count(iter); + dreq->bytes_left = dreq->max_count = iov_iter_count(iter); dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -1042,7 +1042,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, - pos >> PAGE_CACHE_SHIFT, end); + pos >> PAGE_SHIFT, end); } inode_unlock(inode); @@ -1057,7 +1057,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (i_size_read(inode) < iocb->ki_pos) i_size_write(inode, iocb->ki_pos); spin_unlock(&inode->i_lock); - generic_write_sync(file, pos, result); + + /* XXX: should check the generic_write_sync retval */ + generic_write_sync(iocb, result); } } nfs_direct_req_release(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 89bf093d342a..717a8d6af52d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -164,7 +164,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to, iocb->ki_pos); + return nfs_file_direct_read(iocb, to); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -320,7 +320,7 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page, loff_t pos, unsigned len) { unsigned int pglen = nfs_page_length(page); - unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int offset = pos & (PAGE_SIZE - 1); unsigned int end = offset + len; if (pnfs_ld_read_whole_page(file->f_mapping->host)) { @@ -351,7 +351,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; struct page *page; int once_thru = 0; @@ -380,12 +380,12 @@ start: ret = nfs_flush_incompatible(file, page); if (ret) { unlock_page(page); - page_cache_release(page); + put_page(page); } else if (!once_thru && nfs_want_read_modify_write(file, page, pos, len)) { once_thru = 1; ret = nfs_readpage(file, page); - page_cache_release(page); + put_page(page); if (!ret) goto start; } @@ -396,7 +396,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned offset = pos & (PAGE_SIZE - 1); struct nfs_open_context *ctx = nfs_file_open_context(file); int status; @@ -413,20 +413,20 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, if (pglen == 0) { zero_user_segments(page, 0, offset, - end, PAGE_CACHE_SIZE); + end, PAGE_SIZE); SetPageUptodate(page); } else if (end >= pglen) { - zero_user_segment(page, end, PAGE_CACHE_SIZE); + zero_user_segment(page, end, PAGE_SIZE); if (offset == 0) SetPageUptodate(page); } else - zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + zero_user_segment(page, pglen, PAGE_SIZE); } status = nfs_updatepage(file, page, offset, copied); unlock_page(page); - page_cache_release(page); + put_page(page); if (status < 0) return status; @@ -454,7 +454,7 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset, dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", page, offset, length); - if (offset != 0 || length < PAGE_CACHE_SIZE) + if (offset != 0 || length < PAGE_SIZE) return; /* Cancel any unstarted writes on this page */ nfs_wb_page_cancel(page_file_mapping(page)->host, page); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3384dc8e6683..aa59757389dc 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -795,7 +795,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets >= size) goto out; for (i = 0; i < cinfo->ds->nbuckets; i++) { @@ -811,7 +811,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, swap(cinfo->ds->buckets, buckets); cinfo->ds->nbuckets = size; out: - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); kfree(buckets); return 0; } @@ -890,6 +890,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_READ, + false, GFP_KERNEL); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -915,6 +916,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0cb1abd535e3..0e8018bc9880 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -26,6 +26,8 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) +static struct group_info *ff_zero_group; + static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) { @@ -53,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) kfree(FF_LAYOUT_FROM_HDR(lo)); } -static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return -ENOBUFS; - memcpy(stateid, p, NFS4_STATEID_SIZE); + stateid->type = NFS4_PNFS_DS_STATEID_TYPE; + memcpy(stateid->data, p, NFS4_STATEID_SIZE); dprintk("%s: stateid id= [%x%x%x%x]\n", __func__, p[0], p[1], p[2], p[3]); return 0; @@ -211,10 +214,16 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { + struct rpc_cred *cred; + ff_layout_remove_mirror(mirror); kfree(mirror->fh_versions); - if (mirror->cred) - put_rpccred(mirror->cred); + cred = rcu_access_pointer(mirror->ro_cred); + if (cred) + put_rpccred(cred); + cred = rcu_access_pointer(mirror->rw_cred); + if (cred) + put_rpccred(cred); nfs4_ff_layout_put_deviceid(mirror->mirror_ds); kfree(mirror); } @@ -290,6 +299,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new, { u64 new_end, old_end; + if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) + return false; if (new->pls_range.iomode != old->pls_range.iomode) return false; old_end = pnfs_calc_offset_end(old->pls_range.offset, @@ -310,8 +321,6 @@ ff_lseg_merge(struct pnfs_layout_segment *new, new_end); if (test_bit(NFS_LSEG_ROC, &old->pls_flags)) set_bit(NFS_LSEG_ROC, &new->pls_flags); - if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) - set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags); return true; } @@ -407,8 +416,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; struct nfs4_deviceid_node *idnode; - u32 ds_count; - u32 fh_count; + struct auth_cred acred = { .group_info = ff_zero_group }; + struct rpc_cred __rcu *cred; + u32 ds_count, fh_count, id; int j; rc = -EIO; @@ -456,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->efficiency = be32_to_cpup(p); /* stateid */ - rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid); + rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); if (rc) goto out_err_free; @@ -484,24 +494,49 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->fh_versions_cnt = fh_count; /* user */ - rc = decode_name(&stream, &fls->mirror_array[i]->uid); + rc = decode_name(&stream, &id); if (rc) goto out_err_free; + acred.uid = make_kuid(&init_user_ns, id); + /* group */ - rc = decode_name(&stream, &fls->mirror_array[i]->gid); + rc = decode_name(&stream, &id); if (rc) goto out_err_free; + acred.gid = make_kgid(&init_user_ns, id); + + /* find the cred for it */ + rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags)); + if (IS_ERR(cred)) { + rc = PTR_ERR(cred); + goto out_err_free; + } + + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); + else + rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->ro_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); + } else { + cred = xchg(&mirror->rw_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; } - dprintk("%s: uid %d gid %d\n", __func__, - fls->mirror_array[i]->uid, - fls->mirror_array[i]->gid); + dprintk("%s: iomode %s uid %u gid %u\n", __func__, + lgr->range.iomode == IOMODE_READ ? "READ" : "RW", + from_kuid(&init_user_ns, acred.uid), + from_kgid(&init_user_ns, acred.gid)); } p = xdr_inline_decode(&stream, 4); @@ -745,7 +780,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, else { int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets != 0) kfree(buckets); else { @@ -759,7 +794,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, NFS_INVALID_STABLE_HOW; } } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); return 0; } } @@ -786,6 +821,36 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, } static void +ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, + bool strict_iomode) +{ +retry_strict: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + strict_iomode, + GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + } + + /* If we don't have checking, do get a IOMODE_RW + * segment, and the server wants to avoid READs + * there, then retry! + */ + if (pgio->pg_lseg && !strict_iomode && + ff_layout_avoid_read_on_rw(pgio->pg_lseg)) { + strict_iomode = true; + goto retry_strict; + } +} + +static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { @@ -795,26 +860,23 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; /* Use full layout for now */ - if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, - IOMODE_READ, - GFP_KERNEL); - if (IS_ERR(pgio->pg_lseg)) { - pgio->pg_error = PTR_ERR(pgio->pg_lseg); - pgio->pg_lseg = NULL; - return; - } - } + if (!pgio->pg_lseg) + ff_layout_pg_get_read(pgio, req, false); + else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) + ff_layout_pg_get_read(pgio, req, true); + /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) goto out_mds; ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); - if (!ds) - goto out_mds; + if (!ds) { + if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) + goto out_pnfs; + else + goto out_mds; + } + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); pgio->pg_mirror_idx = ds_idx; @@ -828,6 +890,12 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_read_mds(pgio); + return; + +out_pnfs: + pnfs_set_lo_fail(pgio->pg_lseg); + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; } static void @@ -847,6 +915,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -870,8 +939,12 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, for (i = 0; i < pgio->pg_mirror_count; i++) { ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); - if (!ds) - goto out_mds; + if (!ds) { + if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) + goto out_pnfs; + else + goto out_mds; + } pgm = &pgio->pg_mirrors[i]; mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; @@ -883,6 +956,12 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); + return; + +out_pnfs: + pnfs_set_lo_fail(pgio->pg_lseg); + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; } static unsigned int @@ -895,6 +974,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -1067,8 +1147,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: - if (ff_layout_no_fallback_to_mds(lseg) || - ff_layout_has_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg)) return -NFS4ERR_RESET_TO_PNFS; reset: dprintk("%s Retry through MDS. Error %d\n", __func__, @@ -1215,8 +1294,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; - set_bit(NFS_LAYOUT_RETURN_REQUESTED, - &hdr->lseg->pls_layout->plh_flags); pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1260,7 +1337,7 @@ ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) } static bool -ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) +ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx) { /* No mirroring for now */ struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -1297,16 +1374,10 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, rpc_exit(task, -EIO); return -EIO; } - if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { - dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); - if (ff_layout_has_available_ds(hdr->lseg)) - pnfs_read_resend_pnfs(hdr); - else - ff_layout_reset_read(hdr); - rpc_exit(task, 0); + if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { + rpc_exit(task, -EHOSTDOWN); return -EAGAIN; } - hdr->pgio_done_cb = ff_layout_read_done_cb; ff_layout_read_record_layoutstats_start(task, hdr); return 0; @@ -1496,14 +1567,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } - if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { - bool retry_pnfs; - - retry_pnfs = ff_layout_has_available_ds(hdr->lseg); - dprintk("%s task %u reset io to %s\n", __func__, - task->tk_pid, retry_pnfs ? "pNFS" : "MDS"); - ff_layout_reset_write(hdr, retry_pnfs); - rpc_exit(task, 0); + if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { + rpc_exit(task, -EHOSTDOWN); return -EAGAIN; } @@ -1712,7 +1777,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) goto out_failed; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) goto out_failed; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1720,6 +1785,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + hdr->pgio_done_cb = ff_layout_read_done_cb; atomic_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); @@ -1737,11 +1803,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) vers == 3 ? &ff_layout_read_call_ops_v3 : &ff_layout_read_call_ops_v4, 0, RPC_TASK_SOFTCONN); - + put_rpccred(ds_cred); return PNFS_ATTEMPTED; out_failed: - if (ff_layout_has_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg)) return PNFS_TRY_AGAIN; return PNFS_NOT_ATTEMPTED; } @@ -1769,7 +1835,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) return PNFS_NOT_ATTEMPTED; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) return PNFS_NOT_ATTEMPTED; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1798,6 +1864,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) vers == 3 ? &ff_layout_write_call_ops_v3 : &ff_layout_write_call_ops_v4, sync, RPC_TASK_SOFTCONN); + put_rpccred(ds_cred); return PNFS_ATTEMPTED; } @@ -1824,7 +1891,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct rpc_clnt *ds_clnt; struct rpc_cred *ds_cred; u32 idx; - int vers; + int vers, ret; struct nfs_fh *fh; idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); @@ -1838,7 +1905,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) goto out_err; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1854,10 +1921,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) if (fh) data->args.fh = fh; - return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, + ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, how, RPC_TASK_SOFTCONN); + put_rpccred(ds_cred); + return ret; out_err: pnfs_generic_prepare_to_resend_writes(data); pnfs_generic_commit_release(data); @@ -2223,6 +2292,11 @@ static int __init nfs4flexfilelayout_init(void) { printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n", __func__); + if (!ff_zero_group) { + ff_zero_group = groups_alloc(0); + if (!ff_zero_group) + return -ENOMEM; + } return pnfs_register_layoutdriver(&flexfilelayout_type); } @@ -2231,6 +2305,10 @@ static void __exit nfs4flexfilelayout_exit(void) printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n", __func__); pnfs_unregister_layoutdriver(&flexfilelayout_type); + if (ff_zero_group) { + put_group_info(ff_zero_group); + ff_zero_group = NULL; + } } MODULE_ALIAS("nfs-layouttype4-4"); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index dd353bb7dc0a..1bcdb15d0c41 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -10,7 +10,8 @@ #define FS_NFS_NFS4FLEXFILELAYOUT_H #define FF_FLAGS_NO_LAYOUTCOMMIT 1 -#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_READ_IO 4 #include "../pnfs.h" @@ -76,9 +77,8 @@ struct nfs4_ff_layout_mirror { u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; - u32 uid; - u32 gid; - struct rpc_cred *cred; + struct rpc_cred __rcu *ro_cred; + struct rpc_cred __rcu *rw_cred; atomic_t ref; spinlock_t lock; struct nfs4_ff_layoutstat read_stat; @@ -154,6 +154,12 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg) } static inline bool +ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) +{ + return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO; +} + +static inline bool ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) { return nfs4_test_deviceid_unavailable(node); @@ -192,4 +198,7 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred); bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); +bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); +bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); + #endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */ diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index add0e5a70bd6..0aa36be71fce 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, return e1->opnum < e2->opnum ? -1 : 1; if (e1->status != e2->status) return e1->status < e2->status ? -1 : 1; - ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + ret = memcmp(e1->stateid.data, e2->stateid.data, + sizeof(e1->stateid.data)); if (ret != 0) return ret; ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); @@ -302,40 +303,26 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, return 0; } -/* currently we only support AUTH_NONE and AUTH_SYS */ -static rpc_authflavor_t -nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror) +static struct rpc_cred * +ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) { - if (mirror->uid == (u32)-1) - return RPC_AUTH_NULL; - return RPC_AUTH_UNIX; -} + struct rpc_cred *cred, __rcu **pcred; -/* fetch cred for NFSv3 DS */ -static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, - struct nfs4_pnfs_ds *ds) -{ - if (ds->ds_clp && !mirror->cred && - mirror->mirror_ds->ds_versions[0].version == 3) { - struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth; - struct rpc_cred *cred; - struct auth_cred acred = { - .uid = make_kuid(&init_user_ns, mirror->uid), - .gid = make_kgid(&init_user_ns, mirror->gid), - }; - - /* AUTH_NULL ignores acred */ - cred = auth->au_ops->lookup_cred(auth, &acred, 0); - if (IS_ERR(cred)) { - dprintk("%s: lookup_cred failed with %ld\n", - __func__, PTR_ERR(cred)); - return PTR_ERR(cred); - } else { - if (cmpxchg(&mirror->cred, NULL, cred)) - put_rpccred(cred); - } - } - return 0; + if (iomode == IOMODE_READ) + pcred = &mirror->ro_cred; + else + pcred = &mirror->rw_cred; + + rcu_read_lock(); + do { + cred = rcu_dereference(*pcred); + if (!cred) + break; + + cred = get_rpccred_rcu(cred); + } while(!cred); + rcu_read_unlock(); + return cred; } struct nfs_fh * @@ -356,7 +343,23 @@ out: return fh; } -/* Upon return, either ds is connected, or ds is NULL */ +/** + * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call + * @lseg: the layout segment we're operating on + * @ds_idx: index of the DS to use + * @fail_return: return layout on connect failure? + * + * Try to prepare a DS connection to accept an RPC call. This involves + * selecting a mirror to use and connecting the client to it if it's not + * already connected. + * + * Since we only need a single functioning mirror to satisfy a read, we don't + * want to return the layout if there is one. For writes though, any down + * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish + * between the two cases. + * + * Returns a pointer to a connected DS object on success or NULL on failure. + */ struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, bool fail_return) @@ -367,7 +370,6 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - rpc_authflavor_t flavor; if (!ff_layout_mirror_valid(lseg, mirror)) { pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", @@ -383,9 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ smp_rmb(); if (ds->ds_clp) - goto out_update_creds; - - flavor = nfs4_ff_layout_choose_authflavor(mirror); + goto out; /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. @@ -394,7 +394,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, mirror->mirror_ds->ds_versions[0].minor_version, - flavor); + RPC_AUTH_UNIX); /* connect success, check rsize/wsize limit */ if (ds->ds_clp) { @@ -410,20 +410,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); - if (!fail_return) { - if (ff_layout_has_available_ds(lseg)) - set_bit(NFS_LAYOUT_RETURN_REQUESTED, - &lseg->pls_layout->plh_flags); - else - pnfs_error_mark_layout_for_return(ino, lseg); - } else + if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); ds = NULL; - goto out; } -out_update_creds: - if (ff_layout_update_mirror_cred(mirror, ds)) - ds = NULL; out: return ds; } @@ -433,16 +423,15 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - struct rpc_cred *cred = ERR_PTR(-EINVAL); - - if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true)) - goto out; + struct rpc_cred *cred; - if (mirror && mirror->cred) - cred = mirror->cred; - else - cred = mdscred; -out: + if (mirror) { + cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); + if (!cred) + cred = get_rpccred(mdscred); + } else { + cred = get_rpccred(mdscred); + } return cred; } @@ -562,6 +551,18 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) return ff_rw_layout_has_available_ds(lseg); } +bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg) +{ + return ff_layout_no_fallback_to_mds(lseg) || + ff_layout_has_available_ds(lseg); +} + +bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg) +{ + return lseg->pls_range.iomode == IOMODE_RW && + ff_layout_no_read_on_rw(lseg); +} + module_param(dataserver_retrans, uint, 0644); MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " "retries a request before it attempts further " diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 33d18c411905..52e7d6869e3b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -940,7 +940,7 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); @@ -1958,9 +1958,7 @@ static void init_once(void *foo) nfsi->nrequests = 0; nfsi->commit_info.ncommit = 0; atomic_set(&nfsi->commit_info.rpcs_out, 0); - atomic_set(&nfsi->silly_count, 1); - INIT_HLIST_HEAD(&nfsi->silly_list); - init_waitqueue_head(&nfsi->waitqueue); + init_rwsem(&nfsi->rmdir_sem); nfs4_init_once(nfsi); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 565f8135ae1f..5154fa65a2f2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -477,6 +477,7 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); +int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, @@ -638,11 +639,11 @@ unsigned int nfs_page_length(struct page *page) if (i_size > 0) { pgoff_t page_index = page_file_index(page); - pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT; if (page_index < end_index) - return PAGE_CACHE_SIZE; + return PAGE_SIZE; if (page_index == end_index) - return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; + return ((i_size - 1) & ~PAGE_MASK) + 1; } return 0; } diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 17c0fa1eccfa..720d92f5abfb 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -11,6 +11,38 @@ #define NFSDBG_FACILITY NFSDBG_PROC +/* + * nfs3_prepare_get_acl, nfs3_complete_get_acl, nfs3_abort_get_acl: Helpers for + * caching get_acl results in a race-free way. See fs/posix_acl.c:get_acl() + * for explanations. + */ +static void nfs3_prepare_get_acl(struct posix_acl **p) +{ + struct posix_acl *sentinel = uncached_acl_sentinel(current); + + if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) { + /* Not the first reader or sentinel already in place. */ + } +} + +static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl) +{ + struct posix_acl *sentinel = uncached_acl_sentinel(current); + + /* Only cache the ACL if our sentinel is still in place. */ + posix_acl_dup(acl); + if (cmpxchg(p, sentinel, acl) != sentinel) + posix_acl_release(acl); +} + +static void nfs3_abort_get_acl(struct posix_acl **p) +{ + struct posix_acl *sentinel = uncached_acl_sentinel(current); + + /* Remove our sentinel upon failure. */ + cmpxchg(p, sentinel, ACL_NOT_CACHED); +} + struct posix_acl *nfs3_get_acl(struct inode *inode, int type) { struct nfs_server *server = NFS_SERVER(inode); @@ -55,6 +87,11 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) if (res.fattr == NULL) return ERR_PTR(-ENOMEM); + if (args.mask & NFS_ACL) + nfs3_prepare_get_acl(&inode->i_acl); + if (args.mask & NFS_DFACL) + nfs3_prepare_get_acl(&inode->i_default_acl); + status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); @@ -89,12 +126,12 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) } if (res.mask & NFS_ACL) - set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); + nfs3_complete_get_acl(&inode->i_acl, res.acl_access); else forget_cached_acl(inode, ACL_TYPE_ACCESS); if (res.mask & NFS_DFACL) - set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); + nfs3_complete_get_acl(&inode->i_default_acl, res.acl_default); else forget_cached_acl(inode, ACL_TYPE_DEFAULT); @@ -108,6 +145,8 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) } getout: + nfs3_abort_get_acl(&inode->i_acl); + nfs3_abort_get_acl(&inode->i_default_acl); posix_acl_release(res.acl_access); posix_acl_release(res.acl_default); nfs_free_fattr(res.fattr); diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index b587ccd31083..b6cd15314bab 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,6 +13,7 @@ /* nfs4.2proc.c */ int nfs42_proc_allocate(struct file *, loff_t, loff_t); +ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index dff83460e5a6..aa03ed09ba06 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } +static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, + struct nfs_lock_context *src_lock, + struct file *dst, loff_t pos_dst, + struct nfs_lock_context *dst_lock, + size_t count) +{ + struct nfs42_copy_args args = { + .src_fh = NFS_FH(file_inode(src)), + .src_pos = pos_src, + .dst_fh = NFS_FH(file_inode(dst)), + .dst_pos = pos_dst, + .count = count, + }; + struct nfs42_copy_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct inode *dst_inode = file_inode(dst); + struct nfs_server *server = NFS_SERVER(dst_inode); + int status; + + status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + src_lock, FMODE_READ); + if (status) + return status; + + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + dst_lock, FMODE_WRITE); + if (status) + return status; + + status = nfs4_call_sync(server->client, server, &msg, + &args.seq_args, &res.seq_res, 0); + if (status == -ENOTSUPP) + server->caps &= ~NFS_CAP_COPY; + if (status) + return status; + + if (res.write_res.verifier.committed != NFS_FILE_SYNC) { + status = nfs_commit_file(dst, &res.write_res.verifier.verifier); + if (status) + return status; + } + + truncate_pagecache_range(dst_inode, pos_dst, + pos_dst + res.write_res.count); + + return res.write_res.count; +} + +ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, + struct file *dst, loff_t pos_dst, + size_t count) +{ + struct nfs_server *server = NFS_SERVER(file_inode(dst)); + struct nfs_lock_context *src_lock; + struct nfs_lock_context *dst_lock; + struct nfs4_exception src_exception = { }; + struct nfs4_exception dst_exception = { }; + ssize_t err, err2; + + if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) + return -EOPNOTSUPP; + + src_lock = nfs_get_lock_context(nfs_file_open_context(src)); + if (IS_ERR(src_lock)) + return PTR_ERR(src_lock); + + src_exception.inode = file_inode(src); + src_exception.state = src_lock->open_context->state; + + dst_lock = nfs_get_lock_context(nfs_file_open_context(dst)); + if (IS_ERR(dst_lock)) { + err = PTR_ERR(dst_lock); + goto out_put_src_lock; + } + + dst_exception.inode = file_inode(dst); + dst_exception.state = dst_lock->open_context->state; + + do { + inode_lock(file_inode(dst)); + err = _nfs42_proc_copy(src, pos_src, src_lock, + dst, pos_dst, dst_lock, count); + inode_unlock(file_inode(dst)); + + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } + + err2 = nfs4_handle_exception(server, err, &src_exception); + err = nfs4_handle_exception(server, err, &dst_exception); + if (!err) + err = err2; + } while (src_exception.retry || dst_exception.retry); + + nfs_put_lock_context(dst_lock); +out_put_src_lock: + nfs_put_lock_context(src_lock); + return err; +} + static loff_t _nfs42_proc_llseek(struct file *filep, struct nfs_lock_context *lock, loff_t offset, int whence) { @@ -232,7 +337,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) * with the current stateid. */ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); } else diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 0ca482a51e53..6dc6f2aea0d6 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -9,9 +9,22 @@ #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) +#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 /* wr_count */ + \ + 1 /* wr_committed */ + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) #define encode_allocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_allocate_maxsz (op_decode_hdr_maxsz) +#define encode_copy_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 + 2 + 2 + 1 + 1 + 1) +#define decode_copy_maxsz (op_decode_hdr_maxsz + \ + NFS42_WRITE_RES_SIZE + \ + 1 /* cr_consecutive */ + \ + 1 /* cr_synchronous */) #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_deallocate_maxsz (op_decode_hdr_maxsz) @@ -49,6 +62,16 @@ decode_putfh_maxsz + \ decode_allocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_copy_maxsz) +#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_copy_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_copy(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->src_stateid); + encode_nfs4_stateid(xdr, &args->dst_stateid); + + encode_uint64(xdr, args->src_pos); + encode_uint64(xdr, args->dst_pos); + encode_uint64(xdr, args->count); + + encode_uint32(xdr, 1); /* consecutive = true */ + encode_uint32(xdr, 1); /* synchronous = true */ + encode_uint32(xdr, 0); /* src server list */ +} + static void encode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_args *args, struct compound_hdr *hdr) @@ -182,6 +222,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, } /* + * Encode COPY request + */ +static void nfs4_xdr_enc_copy(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_copy_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->src_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dst_fh, &hdr); + encode_copy(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode DEALLOCATE request */ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, @@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) return decode_op_hdr(xdr, OP_ALLOCATE); } +static int decode_write_response(struct xdr_stream *xdr, + struct nfs42_write_res *res) +{ + __be32 *p; + int stateids; + + p = xdr_inline_decode(xdr, 4 + 8 + 4); + if (unlikely(!p)) + goto out_overflow; + + stateids = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &res->count); + res->verifier.committed = be32_to_cpup(p); + return decode_verifier(xdr, &res->verifier.verifier); + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy_requirements(struct xdr_stream *xdr, + struct nfs42_copy_res *res) { + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(!p)) + goto out_overflow; + + res->consecutive = be32_to_cpup(p++); + res->synchronous = be32_to_cpup(p++); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_COPY); + if (status == NFS4ERR_OFFLOAD_NO_REQS) { + status = decode_copy_requirements(xdr, res); + if (status) + return status; + return NFS4ERR_OFFLOAD_NO_REQS; + } else if (status) + return status; + + status = decode_write_response(xdr, &res->write_res); + if (status) + return status; + + return decode_copy_requirements(xdr, res); +} + static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_DEALLOCATE); @@ -331,6 +447,36 @@ out: } /* + * Decode COPY response + */ +static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_copy_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_copy(xdr, res); +out: + return status; +} + +/* * Decode DEALLOCATE request */ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4afdee420d25..768456fa1b17 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -438,8 +438,9 @@ extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, - fmode_t, const struct nfs_lockowner *); +extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, + const struct nfs_lockowner *, nfs4_stateid *, + struct rpc_cred **); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -496,12 +497,15 @@ extern struct svc_version nfs4_callback_version4; static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) { - memcpy(dst, src, sizeof(*dst)); + memcpy(dst->data, src->data, sizeof(dst->data)); + dst->type = src->type; } static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) { - return memcmp(dst, src, sizeof(*dst)) == 0; + if (dst->type != src->type) + return false; + return memcmp(dst->data, src->data, sizeof(dst->data)) == 0; } static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 22c35abbee9d..014b0e41ace5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -26,7 +26,7 @@ static int nfs4_file_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file_dentry(filp); struct dentry *parent = NULL; struct inode *dir; unsigned openflags = filp->f_flags; @@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; @@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id) } #ifdef CONFIG_NFS_V4_2 +static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t count, unsigned int flags) +{ + struct inode *in_inode = file_inode(file_in); + struct inode *out_inode = file_inode(file_out); + int ret; + + if (in_inode == out_inode) + return -EINVAL; + + /* flush any pending writes */ + ret = nfs_sync_inode(in_inode); + if (ret) + return ret; + ret = nfs_sync_inode(out_inode); + if (ret) + return ret; + + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); +} + static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) { loff_t ret; @@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = { .check_flags = nfs_check_flags, .setlease = simple_nosetlease, #ifdef CONFIG_NFS_V4_2 + .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, .clone_file_range = nfs42_clone_file_range, diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 5ba22c6b0ffa..c444285bb1b1 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -201,7 +201,7 @@ int nfs_idmap_init(void) GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 327b8c34d360..223982eb38c9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -74,6 +74,17 @@ #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) +/* file attributes which can be mapped to nfs attributes */ +#define NFS4_VALID_ATTRS (ATTR_MODE \ + | ATTR_UID \ + | ATTR_GID \ + | ATTR_SIZE \ + | ATTR_ATIME \ + | ATTR_MTIME \ + | ATTR_CTIME \ + | ATTR_ATIME_SET \ + | ATTR_MTIME_SET) + struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); @@ -416,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -NFS4ERR_RECALLCONFLICT: exception->delay = 1; return 0; @@ -2558,15 +2570,20 @@ static int _nfs4_do_open(struct inode *dir, if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { nfs4_exclusive_attrset(opendata, sattr, &label); - - nfs_fattr_init(opendata->o_res.f_attr); - status = nfs4_do_setattr(state->inode, cred, - opendata->o_res.f_attr, sattr, - state, label, olabel); - if (status == 0) { - nfs_setattr_update_inode(state->inode, sattr, - opendata->o_res.f_attr); - nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + /* + * send create attributes which was not set by open + * with an extra setattr. + */ + if (sattr->ia_valid & NFS4_VALID_ATTRS) { + nfs_fattr_init(opendata->o_res.f_attr); + status = nfs4_do_setattr(state->inode, cred, + opendata->o_res.f_attr, sattr, + state, label, olabel); + if (status == 0) { + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); + nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + } } } if (opened && opendata->file_created) @@ -2676,6 +2693,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, .rpc_resp = &res, .rpc_cred = cred, }; + struct rpc_cred *delegation_cred = NULL; unsigned long timestamp = jiffies; fmode_t fmode; bool truncate; @@ -2691,7 +2709,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; fmode = truncate ? FMODE_WRITE : FMODE_READ; - if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { + if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) { /* Use that stateid */ } else if (truncate && state != NULL) { struct nfs_lockowner lockowner = { @@ -2700,13 +2718,17 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, }; if (!nfs4_valid_open_stateid(state)) return -EBADF; - if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, - &lockowner) == -EIO) + if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, + &arg.stateid, &delegation_cred) == -EIO) return -EBADF; } else nfs4_stateid_copy(&arg.stateid, &zero_stateid); + if (delegation_cred) + msg.rpc_cred = delegation_cred; status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + + put_rpccred(delegation_cred); if (status == 0 && state != NULL) renew_lease(server, timestamp); trace_nfs4_setattr(inode, &arg.stateid, status); @@ -3777,7 +3799,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) { - nfs4_setup_sequence(NFS_SERVER(data->dir), + nfs4_setup_sequence(NFS_SB(data->dentry->d_sb), &data->args.seq_args, &data->res.seq_res, task); @@ -4285,7 +4307,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid, if (l_ctx != NULL) lockowner = &l_ctx->lockowner; - return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); + return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL); } EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); @@ -6054,6 +6076,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs4_state_owner *sp = state->owner; unsigned char fl_flags = request->fl_flags; int status = -ENOLCK; @@ -6068,6 +6091,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(state->inode, request); if (status < 0) goto out; + mutex_lock(&sp->so_delegreturn_mutex); down_read(&nfsi->rwsem); if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { /* Yes: cache locks! */ @@ -6075,9 +6099,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock request->fl_flags = fl_flags & ~FL_SLEEP; status = do_vfs_lock(state->inode, request); up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); goto out; } up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: request->fl_flags = fl_flags; @@ -6263,10 +6289,10 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - void *buf, size_t buflen) + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) { - return nfs4_proc_get_acl(d_inode(dentry), buf, buflen); + return nfs4_proc_get_acl(inode, buf, buflen); } static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) @@ -6288,11 +6314,11 @@ static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, } static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - void *buf, size_t buflen) + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) { if (security_ismaclabel(key)) - return nfs4_get_security_label(d_inode(dentry), buf, buflen); + return nfs4_get_security_label(inode, buf, buflen); return -EOPNOTSUPP; } @@ -7351,9 +7377,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) * always set csa_cachethis to FALSE because the current implementation * of the back channel DRC only supports caching the CB_SEQUENCE operation. */ -static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, + struct rpc_clnt *clnt) { unsigned int max_rqst_sz, max_resp_sz; + unsigned int max_bc_payload = rpc_max_bc_payload(clnt); max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; @@ -7371,8 +7399,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ - args->bc_attrs.max_rqst_sz = PAGE_SIZE; - args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_rqst_sz = max_bc_payload; + args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; @@ -7476,7 +7504,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, }; int status; - nfs4_init_channel_attrs(&args); + nfs4_init_channel_attrs(&args, clp->cl_rpcclient); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); @@ -7820,40 +7848,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); - int ret; dprintk("--> %s\n", __func__); - /* Note the is a race here, where a CB_LAYOUTRECALL can come in - * right now covering the LAYOUTGET we are about to send. - * However, that is not so catastrophic, and there seems - * to be no way to prevent it completely. - */ - if (nfs41_setup_sequence(session, &lgp->args.seq_args, - &lgp->res.seq_res, task)) - return; - ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, - NFS_I(lgp->args.inode)->layout, - &lgp->args.range, - lgp->args.ctx->state); - if (ret < 0) - rpc_exit(task, ret); + nfs41_setup_sequence(session, &lgp->args.seq_args, + &lgp->res.seq_res, task); + dprintk("<-- %s\n", __func__); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + nfs41_sequence_done(task, &lgp->res.seq_res); + dprintk("<-- %s\n", __func__); +} + +static int +nfs4_layoutget_handle_exception(struct rpc_task *task, + struct nfs4_layoutget *lgp, struct nfs4_exception *exception) +{ struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; - struct nfs4_state *state = NULL; - unsigned long timeo, now, giveup; + int status = task->tk_status; dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); - if (!nfs41_sequence_done(task, &lgp->res.seq_res)) - goto out; - - switch (task->tk_status) { + switch (status) { case 0: goto out; @@ -7863,57 +7885,43 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) * retry go inband. */ case -NFS4ERR_LAYOUTUNAVAILABLE: - task->tk_status = -ENODATA; + status = -ENODATA; goto out; /* * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). */ case -NFS4ERR_BADLAYOUT: - goto out_overflow; + status = -EOVERFLOW; + goto out; /* * NFS4ERR_LAYOUTTRYLATER is a conflict with another client * (or clients) writing to the same RAID stripe except when * the minlength argument is 0 (see RFC5661 section 18.43.3). + * + * Treat it like we would RECALLCONFLICT -- we retry for a little + * while, and then eventually give up. */ case -NFS4ERR_LAYOUTTRYLATER: - if (lgp->args.minlength == 0) - goto out_overflow; - /* - * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall - * existing layout before getting a new one). - */ - case -NFS4ERR_RECALLCONFLICT: - timeo = rpc_get_timeout(task->tk_client); - giveup = lgp->args.timestamp + timeo; - now = jiffies; - if (time_after(giveup, now)) { - unsigned long delay; - - /* Delay for: - * - Not less then NFS4_POLL_RETRY_MIN. - * - One last time a jiffie before we give up - * - exponential backoff (time_now minus start_attempt) - */ - delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, - min((giveup - now - 1), - now - lgp->args.timestamp)); - - dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", - __func__, delay); - rpc_delay(task, delay); - /* Do not call nfs4_async_handle_error() */ - goto out_restart; + if (lgp->args.minlength == 0) { + status = -EOVERFLOW; + goto out; } - break; + /* Fallthrough */ + case -NFS4ERR_RECALLCONFLICT: + nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT, + exception); + status = -ERECALLCONFLICT; + goto out; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + exception->timeout = 0; spin_lock(&inode->i_lock); if (nfs4_stateid_match(&lgp->args.stateid, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); /* If the open stateid was bad, then recover it. */ - state = lgp->args.ctx->state; + exception->state = lgp->args.ctx->state; break; } lo = NFS_I(inode)->layout; @@ -7926,25 +7934,21 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) * with the current stateid. */ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); } else spin_unlock(&inode->i_lock); - goto out_restart; + status = -EAGAIN; + goto out; } - if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) - goto out_restart; + + status = nfs4_handle_exception(server, status, exception); + if (exception->retry) + status = -EAGAIN; out: dprintk("<-- %s\n", __func__); - return; -out_restart: - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; -out_overflow: - task->tk_status = -EOVERFLOW; - goto out; + return status; } static size_t max_response_pages(struct nfs_server *server) @@ -8013,7 +8017,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { }; struct pnfs_layout_segment * -nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); @@ -8033,6 +8037,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) .flags = RPC_TASK_ASYNC, }; struct pnfs_layout_segment *lseg = NULL; + struct nfs4_exception exception = { .timeout = *timeout }; int status = 0; dprintk("--> %s\n", __func__); @@ -8046,7 +8051,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; - lgp->args.timestamp = jiffies; lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; @@ -8056,13 +8060,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) if (IS_ERR(task)) return ERR_CAST(task); status = nfs4_wait_for_completion_rpc_task(task); - if (status == 0) - status = task->tk_status; + if (status == 0) { + status = nfs4_layoutget_handle_exception(task, lgp, &exception); + *timeout = exception.timeout; + } + trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, &lgp->res.stateid, status); + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); @@ -8118,7 +8126,8 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, + be32_to_cpu(lrp->args.stateid.seqid)); pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); @@ -8653,6 +8662,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) static bool nfs41_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + if (s1->type != s2->type) + return false; + if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) return false; @@ -8793,6 +8805,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 | NFS_CAP_ALLOCATE + | NFS_CAP_COPY | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index d854693a15b0..5075592df145 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -65,7 +65,10 @@ #define OPENOWNER_POOL_SIZE 8 -const nfs4_stateid zero_stateid; +const nfs4_stateid zero_stateid = { + .data = { 0 }, + .type = NFS4_SPECIAL_STATEID_TYPE, +}; static DEFINE_MUTEX(nfs_clid_init_mutex); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -985,15 +988,20 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) * Byte-range lock aware utility to initialize the stateid of read/write * requests. */ -int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, - fmode_t fmode, const struct nfs_lockowner *lockowner) +int nfs4_select_rw_stateid(struct nfs4_state *state, + fmode_t fmode, const struct nfs_lockowner *lockowner, + nfs4_stateid *dst, struct rpc_cred **cred) { - int ret = nfs4_copy_lock_stateid(dst, state, lockowner); + int ret; + + if (cred != NULL) + *cred = NULL; + ret = nfs4_copy_lock_stateid(dst, state, lockowner); if (ret == -EIO) /* A lost lock - don't even consider delegations */ goto out; /* returns true if delegation stateid found and copied */ - if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { + if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) { ret = 0; goto out; } diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2c8d05dae5b1..9c150b153782 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ + { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \ + { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) TRACE_EVENT(pnfs_update_layout, @@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout, u64 count, enum pnfs_iomode iomode, struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, enum pnfs_update_layout_reason reason ), - TP_ARGS(inode, pos, count, iomode, lo, reason), + TP_ARGS(inode, pos, count, iomode, lo, lseg, reason), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, fileid) @@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout, __field(enum pnfs_iomode, iomode) __field(int, layoutstateid_seq) __field(u32, layoutstateid_hash) + __field(long, lseg) __field(enum pnfs_update_layout_reason, reason) ), TP_fast_assign( @@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout, __entry->layoutstateid_seq = 0; __entry->layoutstateid_hash = 0; } + __entry->lseg = (long)lseg; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " "iomode=%s pos=%llu count=%llu " - "layoutstateid=%d:0x%08x (%s)", + "layoutstateid=%d:0x%08x lseg=0x%lx (%s)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout, (unsigned long long)__entry->pos, (unsigned long long)__entry->count, __entry->layoutstateid_seq, __entry->layoutstateid_hash, + __entry->lseg, show_pnfs_update_layout_reason(__entry->reason) ) ); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4e4441216804..661e753fe1c9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4270,6 +4270,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); } +static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_OPEN_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + +static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LOCK_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + +static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) { int status; @@ -4278,7 +4296,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -4937,7 +4955,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) if (status == -EIO) goto out; if (status == 0) { - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); if (unlikely(status)) goto out; } else if (status == -NFS4ERR_DENIED) @@ -4966,7 +4984,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); if (status == 0) - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); return status; } @@ -5001,7 +5019,7 @@ static int decode_space_limit(struct xdr_stream *xdr, blocksize = be32_to_cpup(p); maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } - maxsize >>= PAGE_CACHE_SHIFT; + maxsize >>= PAGE_SHIFT; *pagemod_limit = min_t(u64, maxsize, ULONG_MAX); return 0; out_overflow: @@ -5016,7 +5034,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, __be32 *p; int status; - status = decode_stateid(xdr, &res->delegation); + status = decode_delegation_stateid(xdr, &res->delegation); if (unlikely(status)) return status; p = xdr_inline_decode(xdr, 4); @@ -5096,7 +5114,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) nfs_increment_open_seqid(status, res->seqid); if (status) return status; - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); if (unlikely(status)) return status; @@ -5136,7 +5154,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5148,7 +5166,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5838,6 +5856,12 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) +static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int decode_getdeviceinfo(struct xdr_stream *xdr, struct nfs4_getdeviceinfo_res *res) { @@ -5919,7 +5943,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, if (unlikely(!p)) goto out_overflow; res->return_on_close = be32_to_cpup(p); - decode_stateid(xdr, &res->stateid); + decode_layout_stateid(xdr, &res->stateid); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -5985,7 +6009,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr, goto out_overflow; res->lrs_present = be32_to_cpup(p); if (res->lrs_present) - status = decode_stateid(xdr, &res->stateid); + status = decode_layout_stateid(xdr, &res->stateid); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -7515,6 +7539,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(DEALLOCATE, enc_deallocate, dec_deallocate), PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), PROC(CLONE, enc_clone, dec_clone), + PROC(COPY, enc_copy, dec_copy), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 9f80a086b612..0b9e5cc9a747 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -702,7 +702,7 @@ TRACE_EVENT(nfs_sillyrename_unlink, ), TP_fast_assign( - struct inode *dir = data->dir; + struct inode *dir = d_inode(data->dentry->d_parent); size_t len = data->args.name.len; __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 9aebffb40505..049c1b1f2932 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -486,7 +486,7 @@ static void __r4w_put_page(void *priv, struct page *page) dprintk("%s: index=0x%lx\n", __func__, (page == ZERO_PAGE(0)) ? -1UL : page->index); if (ZERO_PAGE(0) != page) - page_cache_release(page); + put_page(page); return; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 8ce4f61cbaa5..174dd4cf5747 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ req->wb_page = page; - req->wb_index = page_file_index(page); - page_cache_get(page); + if (page) { + req->wb_index = page_file_index(page); + get_page(page); + } req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = count; @@ -392,7 +394,7 @@ static void nfs_clear_request(struct nfs_page *req) struct nfs_lock_context *l_ctx = req->wb_lock_context; if (page != NULL) { - page_cache_release(page); + put_page(page); req->wb_page = NULL; } if (l_ctx != NULL) { @@ -904,7 +906,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, return false; } else { if (req->wb_pgbase != 0 || - prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE) return false; } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2fa483e6dbe2..0c7e0d45a4de 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -270,7 +270,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, }; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range); + return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0); } static int @@ -308,7 +308,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) spin_lock(&inode->i_lock); pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); - pnfs_mark_matching_lsegs_invalid(lo, &head, &range); + pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, @@ -522,13 +522,35 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, return rv; } -/* Returns count of number of matching invalid lsegs remaining in list - * after call. +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)(s1 - s2) > 0; +} + +/** + * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later + * @lo: layout header containing the lsegs + * @tmp_list: list head where doomed lsegs should go + * @recall_range: optional recall range argument to match (may be NULL) + * @seq: only invalidate lsegs obtained prior to this sequence (may be 0) + * + * Walk the list of lsegs in the layout header, and tear down any that should + * be destroyed. If "recall_range" is specified then the segment must match + * that range. If "seq" is non-zero, then only match segments that were handed + * out at or before that sequence. + * + * Returns number of matching invalid lsegs remaining in list after scanning + * it and purging them. */ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range) + const struct pnfs_layout_range *recall_range, + u32 seq) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; @@ -540,10 +562,12 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (!recall_range || should_free_lseg(&lseg->pls_range, recall_range)) { - dprintk("%s: freeing lseg %p iomode %d " + if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq)) + continue; + dprintk("%s: freeing lseg %p iomode %d seq %u" "offset %llu length %llu\n", __func__, - lseg, lseg->pls_range.iomode, lseg->pls_range.offset, - lseg->pls_range.length); + lseg, lseg->pls_range.iomode, lseg->pls_seq, + lseg->pls_range.offset, lseg->pls_range.length); if (!mark_lseg_invalid(lseg, tmp_list)) remaining++; } @@ -730,15 +754,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } -/* - * Compare 2 layout stateid sequence ids, to see which is newer, - * taking into account wraparound issues. - */ -static bool pnfs_seqid_is_newer(u32 s1, u32 s2) -{ - return (s32)(s1 - s2) > 0; -} - /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -781,50 +796,22 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } -int -pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state) -{ - int status = 0; - - dprintk("--> %s\n", __func__); - spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo)) { - status = -EAGAIN; - } else if (!nfs4_valid_open_stateid(open_state)) { - status = -EBADF; - } else if (list_empty(&lo->plh_segs) || - test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { - int seq; - - do { - seq = read_seqbegin(&open_state->seqlock); - nfs4_stateid_copy(dst, &open_state->stateid); - } while (read_seqretry(&open_state->seqlock, seq)); - } else - nfs4_stateid_copy(dst, &lo->plh_stateid); - spin_unlock(&lo->plh_inode->i_lock); - dprintk("<-- %s\n", __func__); - return status; -} - /* -* Get layout from server. -* for now, assume that whole file layouts are requested. -* arg->offset: 0 -* arg->length: all ones -*/ + * Get layout from server. + * for now, assume that whole file layouts are requested. + * arg->offset: 0 + * arg->length: all ones + */ static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, + nfs4_stateid *stateid, const struct pnfs_layout_range *range, - gfp_t gfp_flags) + long *timeout, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; - struct pnfs_layout_segment *lseg; loff_t i_size; dprintk("--> %s\n", __func__); @@ -834,40 +821,31 @@ send_layoutget(struct pnfs_layout_hdr *lo, * store in lseg. If we race with a concurrent seqid morphing * op, then re-send the LAYOUTGET. */ - do { - lgp = kzalloc(sizeof(*lgp), gfp_flags); - if (lgp == NULL) - return NULL; - - i_size = i_size_read(ino); - - lgp->args.minlength = PAGE_CACHE_SIZE; - if (lgp->args.minlength > range->length) - lgp->args.minlength = range->length; - if (range->iomode == IOMODE_READ) { - if (range->offset >= i_size) - lgp->args.minlength = 0; - else if (i_size - range->offset < lgp->args.minlength) - lgp->args.minlength = i_size - range->offset; - } - lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - pnfs_copy_range(&lgp->args.range, range); - lgp->args.type = server->pnfs_curr_ld->id; - lgp->args.inode = ino; - lgp->args.ctx = get_nfs_open_context(ctx); - lgp->gfp_flags = gfp_flags; - lgp->cred = lo->plh_lc_cred; - - lseg = nfs4_proc_layoutget(lgp, gfp_flags); - } while (lseg == ERR_PTR(-EAGAIN)); - - if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg))) - lseg = NULL; - else - pnfs_layout_clear_fail_bit(lo, - pnfs_iomode_to_fail_bit(range->iomode)); + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return ERR_PTR(-ENOMEM); - return lseg; + i_size = i_size_read(ino); + + lgp->args.minlength = PAGE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + pnfs_copy_range(&lgp->args.range, range); + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + nfs4_stateid_copy(&lgp->args.stateid, stateid); + lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; + + return nfs4_proc_layoutget(lgp, timeout, gfp_flags); } static void pnfs_clear_layoutcommit(struct inode *inode, @@ -899,6 +877,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; pnfs_get_layout_hdr(lo); clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); return true; @@ -969,6 +948,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) bool send; nfs4_stateid_copy(&stateid, &lo->plh_stateid); + stateid.seqid = cpu_to_be32(lo->plh_return_seq); iomode = lo->plh_return_iomode; send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); @@ -1012,7 +992,7 @@ _pnfs_return_layout(struct inode *ino) pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); pnfs_clear_layoutcommit(ino, &tmp_list); - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { struct pnfs_layout_range range = { @@ -1341,23 +1321,28 @@ out_existing: /* * iomode matching rules: - * iomode lseg match - * ----- ----- ----- - * ANY READ true - * ANY RW true - * RW READ false - * RW RW true - * READ READ true - * READ RW true + * iomode lseg strict match + * iomode + * ----- ----- ------ ----- + * ANY READ N/A true + * ANY RW N/A true + * RW READ N/A false + * RW RW N/A true + * READ READ N/A true + * READ RW true false + * READ RW false true */ static bool pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, - const struct pnfs_layout_range *range) + const struct pnfs_layout_range *range, + bool strict_iomode) { struct pnfs_layout_range range1; if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || + (range->iomode != ls_range->iomode && + strict_iomode == true) || !pnfs_lseg_range_intersecting(ls_range, range)) return 0; @@ -1372,7 +1357,8 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, */ static struct pnfs_layout_segment * pnfs_find_lseg(struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range) + struct pnfs_layout_range *range, + bool strict_iomode) { struct pnfs_layout_segment *lseg, *ret = NULL; @@ -1381,7 +1367,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && - pnfs_lseg_range_match(&lseg->pls_range, range)) { + pnfs_lseg_range_match(&lseg->pls_range, range, + strict_iomode)) { ret = pnfs_get_lseg(lseg); break; } @@ -1498,6 +1485,7 @@ pnfs_update_layout(struct inode *ino, loff_t pos, u64 count, enum pnfs_iomode iomode, + bool strict_iomode, gfp_t gfp_flags) { struct pnfs_layout_range arg = { @@ -1505,27 +1493,30 @@ pnfs_update_layout(struct inode *ino, .offset = pos, .length = count, }; - unsigned pg_offset; + unsigned pg_offset, seq; struct nfs_server *server = NFS_SERVER(ino); struct nfs_client *clp = server->nfs_client; - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = NULL; struct pnfs_layout_segment *lseg = NULL; + nfs4_stateid stateid; + long timeout = 0; + unsigned long giveup = jiffies + rpc_get_timeout(server->client); bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NO_PNFS); goto out; } if (iomode == IOMODE_READ && i_size_read(ino) == 0) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RD_ZEROLEN); goto out; } if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_MDSTHRESH); goto out; } @@ -1536,14 +1527,14 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BULK_RECALL); dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; @@ -1551,14 +1542,34 @@ lookup_again: /* if LAYOUTGET already failed once we don't try again */ if (pnfs_layout_io_test_failed(lo, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); goto out_unlock; } - first = list_empty(&lo->plh_segs); - if (first) { - /* The first layoutget for the file. Need to serialize per + lseg = pnfs_find_lseg(lo, &arg, strict_iomode); + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_FOUND_CACHED); + goto out_unlock; + } + + if (!nfs4_valid_open_stateid(ctx->state)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + goto out_unlock; + } + + /* + * Choose a stateid for the LAYOUTGET. If we don't have a layout + * stateid, or it has been invalidated, then we must use the open + * stateid. + */ + if (lo->plh_stateid.seqid == 0 || + test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + + /* + * The first layoutget for the file. Need to serialize per * RFC 5661 Errata 3208. */ if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, @@ -1567,18 +1578,17 @@ lookup_again: wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, TASK_UNINTERRUPTIBLE); pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); goto lookup_again; } + + first = true; + do { + seq = read_seqbegin(&ctx->state->seqlock); + nfs4_stateid_copy(&stateid, &ctx->state->stateid); + } while (read_seqretry(&ctx->state->seqlock, seq)); } else { - /* Check to see if the layout for the given range - * already exists - */ - lseg = pnfs_find_lseg(lo, &arg); - if (lseg) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, - PNFS_UPDATE_LAYOUT_FOUND_CACHED); - goto out_unlock; - } + nfs4_stateid_copy(&stateid, &lo->plh_stateid); } /* @@ -1593,15 +1603,17 @@ lookup_again: pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); dprintk("%s retrying\n", __func__); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg, PNFS_UPDATE_LAYOUT_RETRY); goto lookup_again; } - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETURN); goto out_put_layout_hdr; } if (pnfs_layoutgets_blocked(lo)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; } @@ -1618,18 +1630,44 @@ lookup_again: spin_unlock(&clp->cl_lock); } - pg_offset = arg.offset & ~PAGE_CACHE_MASK; + pg_offset = arg.offset & ~PAGE_MASK; if (pg_offset) { arg.offset -= pg_offset; arg.length += pg_offset; } if (arg.length != NFS4_MAX_UINT64) - arg.length = PAGE_CACHE_ALIGN(arg.length); + arg.length = PAGE_ALIGN(arg.length); - lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - atomic_dec(&lo->plh_outstanding); - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); + if (IS_ERR(lseg)) { + switch(PTR_ERR(lseg)) { + case -ERECALLCONFLICT: + if (time_after(jiffies, giveup)) + lseg = NULL; + /* Fallthrough */ + case -EAGAIN: + pnfs_put_layout_hdr(lo); + if (first) + pnfs_clear_first_layoutget(lo); + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); + goto lookup_again; + } + /* Fallthrough */ + default: + if (!nfs_error_is_fatal(PTR_ERR(lseg))) { + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + } + } + } else { + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + } + + atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1678,38 +1716,36 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; LIST_HEAD(free_me); - int status = -EINVAL; if (!pnfs_sanity_check_layout_range(&res->range)) - goto out; + return ERR_PTR(-EINVAL); /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); - if (!lseg || IS_ERR(lseg)) { + if (IS_ERR_OR_NULL(lseg)) { if (!lseg) - status = -ENOMEM; - else - status = PTR_ERR(lseg); - dprintk("%s: Could not allocate layout: error %d\n", - __func__, status); - goto out; + lseg = ERR_PTR(-ENOMEM); + + dprintk("%s: Could not allocate layout: error %ld\n", + __func__, PTR_ERR(lseg)); + return lseg; } init_lseg(lo, lseg); lseg->pls_range = res->range; + lseg->pls_seq = be32_to_cpu(res->stateid.seqid); spin_lock(&ino->i_lock); if (pnfs_layoutgets_blocked(lo)) { dprintk("%s forget reply due to state\n", __func__); - goto out_forget_reply; + goto out_forget; } if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); - status = -EAGAIN; - goto out_forget_reply; + goto out_forget; } pnfs_set_layout_stateid(lo, &res->stateid, false); } else { @@ -1718,7 +1754,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) * inode invalid, and don't bother validating the stateid * sequence number. */ - pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); lo->plh_barrier = be32_to_cpu(res->stateid.seqid); @@ -1735,18 +1771,17 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me); return lseg; -out: - return ERR_PTR(status); -out_forget_reply: +out_forget: spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - goto out; + return ERR_PTR(-EAGAIN); } static void -pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) +pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, + u32 seq) { if (lo->plh_return_iomode == iomode) return; @@ -1754,6 +1789,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) + lo->plh_return_seq = seq; } /** @@ -1769,7 +1806,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *return_range) + const struct pnfs_layout_range *return_range, + u32 seq) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; @@ -1792,8 +1830,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); - pnfs_set_plh_return_iomode(lo, return_range->iomode); } + + if (remaining) + pnfs_set_plh_return_info(lo, return_range->iomode, seq); + return remaining; } @@ -1810,13 +1851,14 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); - pnfs_set_plh_return_iomode(lo, range.iomode); + pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, + &range, lseg->pls_seq)) { nfs4_stateid stateid; enum pnfs_iomode iomode = lo->plh_return_iomode; @@ -1849,6 +1891,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r req_offset(req), rd_size, IOMODE_READ, + false, GFP_KERNEL); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -1873,6 +1916,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, req_offset(req), wb_size, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -2143,12 +2187,15 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, } /* Resend all requests through pnfs. */ -int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) { struct nfs_pageio_descriptor pgio; - nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); - return nfs_pageio_resend(&pgio, hdr); + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + nfs_pageio_init_read(&pgio, hdr->inode, false, + hdr->completion_ops); + hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); + } } EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); @@ -2158,12 +2205,11 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; enum pnfs_try_status trypnfs; - int err = 0; trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); if (trypnfs == PNFS_TRY_AGAIN) - err = pnfs_read_resend_pnfs(hdr); - if (trypnfs == PNFS_NOT_ATTEMPTED || err) + pnfs_read_resend_pnfs(hdr); + if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status) pnfs_read_through_mds(desc, hdr); } @@ -2405,7 +2451,7 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) spin_lock(&inode->i_lock); if (!NFS_I(inode)->layout) { spin_unlock(&inode->i_lock); - goto out; + goto out_clear_layoutstats; } hdr = NFS_I(inode)->layout; pnfs_get_layout_hdr(hdr); @@ -2434,6 +2480,7 @@ out_free: kfree(data); out_put: pnfs_put_layout_hdr(hdr); +out_clear_layoutstats: smp_mb__before_atomic(); clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); smp_mb__after_atomic(); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1ac1db5f6dad..b21bd0bee784 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -64,6 +64,7 @@ struct pnfs_layout_segment { struct list_head pls_lc_list; struct pnfs_layout_range pls_range; atomic_t pls_refcount; + u32 pls_seq; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; struct work_struct pls_work; @@ -194,6 +195,7 @@ struct pnfs_layout_hdr { unsigned long plh_flags; nfs4_stateid plh_stateid; u32 plh_barrier; /* ignore lower seqids */ + u32 plh_return_seq; enum pnfs_iomode plh_return_iomode; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ @@ -226,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); -extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ @@ -258,16 +260,14 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier); -int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, - struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range, + u32 seq); int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range, + u32 seq); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); @@ -282,12 +282,13 @@ int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); void pnfs_ld_read_done(struct nfs_pgio_header *); -int pnfs_read_resend_pnfs(struct nfs_pgio_header *); +void pnfs_read_resend_pnfs(struct nfs_pgio_header *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, u64 count, enum pnfs_iomode iomode, + bool strict_iomode, gfp_t gfp_flags); void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 4aaed890048f..0dfc476da3e1 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. - * Note this must be called holding the inode (/cinfo) lock + * Note this must be called holding i_lock */ void pnfs_generic_clear_request_commit(struct nfs_page *req, @@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) + if (cond_resched_lock(&cinfo->inode->i_lock)) list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); @@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; @@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, { int i, rv = 0, cnt; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], cinfo, max); @@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct pnfs_layout_segment *freeme; int i; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (pnfs_generic_transfer_commit_list(&b->written, dst, cinfo, 0)) { freeme = b->wlseg; b->wlseg = NULL; - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); goto restart; } } @@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) LIST_HEAD(pages); int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); for (i = idx; i < fl_cinfo->nbuckets; i++) { bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) @@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) freeme = bucket->clseg; bucket->clseg = NULL; list_splice_init(&bucket->committing, &pages); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_retry_commit(&pages, freeme, cinfo, i); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static unsigned int @@ -238,14 +238,31 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages, struct pnfs_commit_bucket *bucket; bucket = &cinfo->ds->buckets[data->ds_commit_index]; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); list_splice_init(&bucket->committing, pages); data->lseg = bucket->clseg; bucket->clseg = NULL; - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } +/* Helper function for pnfs_generic_commit_pagelist to catch an empty + * page list. This can happen when two commits race. */ +static bool +pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, + struct nfs_commit_data *data, + struct nfs_commit_info *cinfo) +{ + if (list_empty(pages)) { + if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) + wake_up_atomic_t(&cinfo->mds->rpcs_out); + nfs_commitdata_release(data); + return true; + } + + return false; +} + /* This follows nfs_commit_list pretty closely */ int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, @@ -280,6 +297,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); if (data->ds_commit_index < 0) { + /* another commit raced with us */ + if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages, + data, cinfo)) + continue; + nfs_init_commit(data, mds_pages, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), @@ -288,6 +310,12 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, LIST_HEAD(pages); pnfs_fetch_commit_bucket_list(&pages, data, cinfo); + + /* another commit raced with us */ + if (pnfs_generic_commit_cancel_empty_pagelist(&pages, + data, cinfo)) + continue; + nfs_init_commit(data, &pages, data->lseg, cinfo); initiate_commit(data, how); } @@ -874,12 +902,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, struct list_head *list; struct pnfs_commit_bucket *buckets; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); cinfo->completion_ops->resched_write(cinfo, req); return; } @@ -896,7 +924,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index eb31e23e7def..6776d7a7839e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -46,7 +46,7 @@ static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) static int nfs_return_empty_page(struct page *page) { - zero_user(page, 0, PAGE_CACHE_SIZE); + zero_user(page, 0, PAGE_SIZE); SetPageUptodate(page); unlock_page(page); return 0; @@ -118,8 +118,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, unlock_page(page); return PTR_ERR(new); } - if (len < PAGE_CACHE_SIZE) - zero_user_segment(page, len, PAGE_CACHE_SIZE); + if (len < PAGE_SIZE) + zero_user_segment(page, len, PAGE_SIZE); nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); @@ -295,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page) int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", - page, PAGE_CACHE_SIZE, page_file_index(page)); + page, PAGE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); nfs_add_stats(inode, NFSIOS_READPAGES, 1); @@ -361,8 +361,8 @@ readpage_async_filler(void *data, struct page *page) if (IS_ERR(new)) goto out_error; - if (len < PAGE_CACHE_SIZE) - zero_user_segment(page, len, PAGE_CACHE_SIZE); + if (len < PAGE_SIZE) + zero_user_segment(page, len, PAGE_SIZE); if (!nfs_pageio_add_request(desc->pgio, new)) { nfs_list_remove_request(new); nfs_readpage_release(new); @@ -424,8 +424,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, pgm = &pgio.pg_mirrors[0]; NFS_I(inode)->read_io += pgm->pg_bytes_written; - npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> + PAGE_SHIFT; nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: put_nfs_open_context(desc.ctx); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f1268280244e..2137e0202f25 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -191,6 +191,7 @@ static const match_table_t nfs_mount_option_tokens = { enum { Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, + Opt_xprt_rdma6, Opt_xprt_err }; @@ -201,6 +202,7 @@ static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_tcp, "tcp" }, { Opt_xprt_tcp6, "tcp6" }, { Opt_xprt_rdma, "rdma" }, + { Opt_xprt_rdma6, "rdma6" }, { Opt_xprt_err, NULL } }; @@ -1456,6 +1458,8 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; + case Opt_xprt_rdma6: + protofamily = AF_INET6; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; @@ -2408,6 +2412,11 @@ static int nfs_compare_super_address(struct nfs_server *server1, struct nfs_server *server2) { struct sockaddr *sap1, *sap2; + struct rpc_xprt *xprt1 = server1->client->cl_xprt; + struct rpc_xprt *xprt2 = server2->client->cl_xprt; + + if (!net_eq(xprt1->xprt_net, xprt2->xprt_net)) + return 0; sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index fa538b2ba251..1868246f56e6 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -30,45 +30,11 @@ static void nfs_free_unlinkdata(struct nfs_unlinkdata *data) { - iput(data->dir); put_rpccred(data->cred); kfree(data->args.name.name); kfree(data); } -#define NAME_ALLOC_LEN(len) ((len+16) & ~15) -/** - * nfs_copy_dname - copy dentry name to data structure - * @dentry: pointer to dentry - * @data: nfs_unlinkdata - */ -static int nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data) -{ - char *str; - int len = dentry->d_name.len; - - str = kmemdup(dentry->d_name.name, NAME_ALLOC_LEN(len), GFP_KERNEL); - if (!str) - return -ENOMEM; - data->args.name.len = len; - data->args.name.name = str; - return 0; -} - -static void nfs_free_dname(struct nfs_unlinkdata *data) -{ - kfree(data->args.name.name); - data->args.name.name = NULL; - data->args.name.len = 0; -} - -static void nfs_dec_sillycount(struct inode *dir) -{ - struct nfs_inode *nfsi = NFS_I(dir); - if (atomic_dec_return(&nfsi->silly_count) == 1) - wake_up(&nfsi->waitqueue); -} - /** * nfs_async_unlink_done - Sillydelete post-processing * @task: rpc_task of the sillydelete @@ -78,7 +44,7 @@ static void nfs_dec_sillycount(struct inode *dir) static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) { struct nfs_unlinkdata *data = calldata; - struct inode *dir = data->dir; + struct inode *dir = d_inode(data->dentry->d_parent); trace_nfs_sillyrename_unlink(data, task->tk_status); if (!NFS_PROTO(dir)->unlink_done(task, dir)) @@ -95,17 +61,21 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) static void nfs_async_unlink_release(void *calldata) { struct nfs_unlinkdata *data = calldata; - struct super_block *sb = data->dir->i_sb; + struct dentry *dentry = data->dentry; + struct super_block *sb = dentry->d_sb; - nfs_dec_sillycount(data->dir); + up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem); + d_lookup_done(dentry); nfs_free_unlinkdata(data); + dput(dentry); nfs_sb_deactive(sb); } static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) { struct nfs_unlinkdata *data = calldata; - NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data); + struct inode *dir = d_inode(data->dentry->d_parent); + NFS_PROTO(dir)->unlink_rpc_prepare(task, data); } static const struct rpc_call_ops nfs_unlink_ops = { @@ -114,7 +84,7 @@ static const struct rpc_call_ops nfs_unlink_ops = { .rpc_call_prepare = nfs_unlink_prepare, }; -static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) +static void nfs_do_call_unlink(struct nfs_unlinkdata *data) { struct rpc_message msg = { .rpc_argp = &data->args, @@ -129,10 +99,31 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n .flags = RPC_TASK_ASYNC, }; struct rpc_task *task; + struct inode *dir = d_inode(data->dentry->d_parent); + nfs_sb_active(dir->i_sb); + data->args.fh = NFS_FH(dir); + nfs_fattr_init(data->res.dir_attr); + + NFS_PROTO(dir)->unlink_setup(&msg, dir); + + task_setup_data.rpc_client = NFS_CLIENT(dir); + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task_async(task); +} + +static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) +{ + struct inode *dir = d_inode(dentry->d_parent); struct dentry *alias; - alias = d_lookup(parent, &data->args.name); - if (alias != NULL) { + down_read_non_owner(&NFS_I(dir)->rmdir_sem); + alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq); + if (IS_ERR(alias)) { + up_read_non_owner(&NFS_I(dir)->rmdir_sem); + return 0; + } + if (!d_in_lookup(alias)) { int ret; void *devname_garbage = NULL; @@ -140,10 +131,8 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n * Hey, we raced with lookup... See if we need to transfer * the sillyrename information to the aliased dentry. */ - nfs_free_dname(data); - ret = nfs_copy_dname(alias, data); spin_lock(&alias->d_lock); - if (ret == 0 && d_really_is_positive(alias) && + if (d_really_is_positive(alias) && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; @@ -152,8 +141,8 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n } else ret = 0; spin_unlock(&alias->d_lock); - nfs_dec_sillycount(dir); dput(alias); + up_read_non_owner(&NFS_I(dir)->rmdir_sem); /* * If we'd displaced old cached devname, free it. At that * point dentry is definitely not a root, so we won't need @@ -162,94 +151,18 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n kfree(devname_garbage); return ret; } - data->dir = igrab(dir); - if (!data->dir) { - nfs_dec_sillycount(dir); - return 0; - } - nfs_sb_active(dir->i_sb); - data->args.fh = NFS_FH(dir); - nfs_fattr_init(data->res.dir_attr); - - NFS_PROTO(dir)->unlink_setup(&msg, dir); - - task_setup_data.rpc_client = NFS_CLIENT(dir); - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task_async(task); + data->dentry = alias; + nfs_do_call_unlink(data); return 1; } -static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) -{ - struct dentry *parent; - struct inode *dir; - int ret = 0; - - - parent = dget_parent(dentry); - if (parent == NULL) - goto out_free; - dir = d_inode(parent); - /* Non-exclusive lock protects against concurrent lookup() calls */ - spin_lock(&dir->i_lock); - if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { - /* Deferred delete */ - hlist_add_head(&data->list, &NFS_I(dir)->silly_list); - spin_unlock(&dir->i_lock); - ret = 1; - goto out_dput; - } - spin_unlock(&dir->i_lock); - ret = nfs_do_call_unlink(parent, dir, data); -out_dput: - dput(parent); -out_free: - return ret; -} - -void nfs_wait_on_sillyrename(struct dentry *dentry) -{ - struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); - - wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1); -} - -void nfs_block_sillyrename(struct dentry *dentry) -{ - struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); - - wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1); -} - -void nfs_unblock_sillyrename(struct dentry *dentry) -{ - struct inode *dir = d_inode(dentry); - struct nfs_inode *nfsi = NFS_I(dir); - struct nfs_unlinkdata *data; - - atomic_inc(&nfsi->silly_count); - spin_lock(&dir->i_lock); - while (!hlist_empty(&nfsi->silly_list)) { - if (!atomic_inc_not_zero(&nfsi->silly_count)) - break; - data = hlist_entry(nfsi->silly_list.first, struct nfs_unlinkdata, list); - hlist_del(&data->list); - spin_unlock(&dir->i_lock); - if (nfs_do_call_unlink(dentry, dir, data) == 0) - nfs_free_unlinkdata(data); - spin_lock(&dir->i_lock); - } - spin_unlock(&dir->i_lock); -} - /** * nfs_async_unlink - asynchronous unlinking of a file * @dir: parent directory of dentry * @dentry: dentry to unlink */ static int -nfs_async_unlink(struct inode *dir, struct dentry *dentry) +nfs_async_unlink(struct dentry *dentry, struct qstr *name) { struct nfs_unlinkdata *data; int status = -ENOMEM; @@ -258,13 +171,18 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) goto out; + data->args.name.name = kstrdup(name->name, GFP_KERNEL); + if (!data->args.name.name) + goto out_free; + data->args.name.len = name->len; data->cred = rpc_lookup_cred(); if (IS_ERR(data->cred)) { status = PTR_ERR(data->cred); - goto out_free; + goto out_free_name; } data->res.dir_attr = &data->dir_attr; + init_waitqueue_head(&data->wq); status = -EBUSY; spin_lock(&dentry->d_lock); @@ -284,6 +202,8 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) out_unlock: spin_unlock(&dentry->d_lock); put_rpccred(data->cred); +out_free_name: + kfree(data->args.name.name); out_free: kfree(data); out: @@ -302,17 +222,15 @@ out: void nfs_complete_unlink(struct dentry *dentry, struct inode *inode) { - struct nfs_unlinkdata *data = NULL; + struct nfs_unlinkdata *data; spin_lock(&dentry->d_lock); - if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { - dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; - data = dentry->d_fsdata; - dentry->d_fsdata = NULL; - } + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + data = dentry->d_fsdata; + dentry->d_fsdata = NULL; spin_unlock(&dentry->d_lock); - if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) + if (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)) nfs_free_unlinkdata(data); } @@ -559,18 +477,10 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) /* queue unlink first. Can't do this from rpc_release as it * has to allocate memory */ - error = nfs_async_unlink(dir, dentry); + error = nfs_async_unlink(dentry, &sdentry->d_name); if (error) goto out_dput; - /* populate unlinkdata with the right dname */ - error = nfs_copy_dname(sdentry, - (struct nfs_unlinkdata *)dentry->d_fsdata); - if (error) { - nfs_cancel_async_unlink(dentry); - goto out_dput; - } - /* run the rename task, undo unlink if it fails */ task = nfs_async_rename(dir, dir, dentry, sdentry, nfs_complete_sillyrename); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5754835a2886..e1c74d3db64d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -150,7 +150,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c spin_lock(&inode->i_lock); i_size = i_size_read(inode); - end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + end_index = (i_size - 1) >> PAGE_SHIFT; if (i_size > 0 && page_file_index(page) < end_index) goto out; end = page_file_offset(page) + ((loff_t)offset+count); @@ -245,8 +245,7 @@ static void nfs_mark_uptodate(struct nfs_page *req) static int wb_priority(struct writeback_control *wbc) { int ret = 0; - if (wbc->for_reclaim) - return FLUSH_HIGHPRI | FLUSH_COND_STABLE; + if (wbc->sync_mode == WB_SYNC_ALL) ret = FLUSH_COND_STABLE; return ret; @@ -737,7 +736,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) head = req->wb_head; spin_lock(&inode->i_lock); - if (likely(!PageSwapCache(head->wb_page))) { + if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); smp_mb__after_atomic(); @@ -759,7 +758,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) static void nfs_mark_request_dirty(struct nfs_page *req) { - __set_page_dirty_nobuffers(req->wb_page); + if (req->wb_page) + __set_page_dirty_nobuffers(req->wb_page); } /* @@ -804,7 +804,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must hold the cinfo->lock, and the nfs_page lock. + * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. */ void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -832,10 +832,11 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); - spin_unlock(cinfo->lock); - nfs_mark_page_unstable(req->wb_page, cinfo); + spin_unlock(&cinfo->inode->i_lock); + if (req->wb_page) + nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -864,7 +865,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode) { - cinfo->lock = &inode->i_lock; + cinfo->inode = inode; cinfo->mds = &NFS_I(inode)->commit_info; cinfo->ds = pnfs_get_ds_info(inode); cinfo->dreq = NULL; @@ -967,7 +968,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo) return cinfo->mds->ncommit; } -/* cinfo->lock held by caller */ +/* cinfo->inode->i_lock held by caller */ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) @@ -979,7 +980,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) + if (cond_resched_lock(&cinfo->inode->i_lock)) list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); nfs_list_add_request(req, dst); @@ -1005,7 +1006,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->mds->ncommit > 0) { const int max = INT_MAX; @@ -1013,7 +1014,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, cinfo, max); ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); return ret; } @@ -1709,6 +1710,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, { struct nfs_commit_data *data; + /* another commit raced with us */ + if (list_empty(head)) + return 0; + data = nfs_commitdata_alloc(); if (!data) @@ -1724,6 +1729,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, return -ENOMEM; } +int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) +{ + struct inode *inode = file_inode(file); + struct nfs_open_context *open; + struct nfs_commit_info cinfo; + struct nfs_page *req; + int ret; + + open = get_nfs_open_context(nfs_file_open_context(file)); + req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode)); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out_put; + } + + nfs_init_cinfo_from_inode(&cinfo, inode); + + memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier)); + nfs_request_add_commit_list(req, &cinfo); + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret > 0) + ret = 0; + + nfs_free_request(req); +out_put: + put_nfs_open_context(open); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_commit_file); + /* * COMMIT call returned */ @@ -1748,7 +1783,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - nfs_clear_page_commit(req->wb_page); + if (req->wb_page) + nfs_clear_page_commit(req->wb_page); dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, @@ -1942,7 +1978,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) { loff_t range_start = page_file_offset(page); - loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 51c3b06e8036..d818e4ffd79f 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -552,7 +552,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, * different read/write sizes for file systems known to have * problems with large blocks */ if (nfserr == 0) { - struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; + struct super_block *sb = argp->fh.fh_dentry->d_sb; /* Note that we don't care for remote fs's here */ if (sb->s_magic == MSDOS_SUPER_MAGIC) { @@ -588,7 +588,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); if (nfserr == 0) { - struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; + struct super_block *sb = argp->fh.fh_dentry->d_sb; /* Note that we don't care for remote fs's here */ switch (sb->s_magic) { diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2246454dec76..dba2ff8eaa68 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -146,7 +146,7 @@ static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) default: case FSIDSOURCE_DEV: p = xdr_encode_hyper(p, (u64)huge_encode_dev - (d_inode(fhp->fh_dentry)->i_sb->s_dev)); + (fhp->fh_dentry->d_sb->s_dev)); break; case FSIDSOURCE_FSID: p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); @@ -379,7 +379,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, */ hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - - hdr; + + rqstp->rq_arg.tail[0].iov_len - hdr; /* * Round the length of the data which was specified up to * the next multiple of XDR units and then compare that diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 825c7bc8d789..953c0755cb37 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -289,7 +289,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, status = nfserr_bad_stateid; mutex_lock(&ls->ls_mutex); - if (stateid->si_generation > stid->sc_stateid.si_generation) + if (nfsd4_stateid_generation_after(stateid, &stid->sc_stateid)) goto out_unlock_stid; if (layout_type != ls->ls_layout_type) goto out_unlock_stid; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0462eeddfff9..f5f82e145018 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4651,12 +4651,6 @@ grace_disallows_io(struct net *net, struct inode *inode) return opens_in_grace(net) && mandatory_lock(inode); } -/* Returns true iff a is later than b: */ -static bool stateid_generation_after(stateid_t *a, stateid_t *b) -{ - return (s32)(a->si_generation - b->si_generation) > 0; -} - static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* @@ -4670,7 +4664,7 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfs_ok; /* If the client sends us a stateid from the future, it's buggy: */ - if (stateid_generation_after(in, ref)) + if (nfsd4_stateid_generation_after(in, ref)) return nfserr_bad_stateid; /* * However, we could see a stateid from the past, even from a diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c1681ce894c5..a8919444c460 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -426,7 +426,7 @@ static bool is_root_export(struct svc_export *exp) static struct super_block *exp_sb(struct svc_export *exp) { - return d_inode(exp->ex_path.dentry)->i_sb; + return exp->ex_path.dentry->d_sb; } static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index c050c53036a6..986e51e5ceac 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -573,6 +573,11 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_SEQUENCE, }; +/* Returns true iff a is later than b: */ +static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b) +{ + return (s32)(a->si_generation - b->si_generation) > 0; +} struct nfsd4_compound_state; struct nfsd_net; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d40010e4f1a9..6fbd81ecb410 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -935,8 +935,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; loff_t pos = offset; - loff_t end = LLONG_MAX; unsigned int pflags = current->flags; + int flags = 0; if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* @@ -955,9 +955,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (!EX_ISSYNC(exp)) stable = 0; + if (stable && !use_wgather) + flags |= RWF_SYNC; + /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, flags); set_fs(oldfs); if (host_err < 0) goto out_nfserr; @@ -965,15 +968,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_write += host_err; fsnotify_modify(file); - if (stable) { - if (use_wgather) { - host_err = wait_for_concurrent_writes(file); - } else { - if (*cnt) - end = offset + *cnt - 1; - host_err = vfs_fsync_range(file, offset, end, 0); - } - } + if (stable && use_wgather) + host_err = wait_for_concurrent_writes(file); out_nfserr: dprintk("nfsd: write complete host_err=%d\n", host_err); diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 2ccbf5531554..1a85d94f5b25 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -13,13 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Original code was written by Koji Sato <koji@osrg.net>. - * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, - * Amagai Yoshiji <amagai@osrg.net>. + * Originally written by Koji Sato. + * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. */ #include <linux/types.h> @@ -58,7 +53,7 @@ nilfs_palloc_groups_count(const struct inode *inode) * @inode: inode of metadata file using this allocator * @entry_size: size of the persistent object */ -int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); @@ -73,13 +68,17 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) mi->mi_blocks_per_group = DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), mi->mi_entries_per_block) + 1; - /* Number of blocks in a group including entry blocks and - a bitmap block */ + /* + * Number of blocks in a group including entry blocks + * and a bitmap block + */ mi->mi_blocks_per_desc_block = nilfs_palloc_groups_per_desc_block(inode) * mi->mi_blocks_per_group + 1; - /* Number of blocks per descriptor including the - descriptor block */ + /* + * Number of blocks per descriptor including the + * descriptor block + */ return 0; } @@ -389,7 +388,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, - unsigned bsize, + unsigned int bsize, spinlock_t *lock) { int pos, end = bsize; @@ -624,7 +623,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu\n", + "entry number %llu already freed: ino=%lu", (unsigned long long)req->pr_entry_nr, (unsigned long)inode->i_ino); else @@ -665,7 +664,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu\n", + "entry number %llu already freed: ino=%lu", (unsigned long long)req->pr_entry_nr, (unsigned long)inode->i_ino); else @@ -740,8 +739,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned long group, group_offset; __u64 group_min_nr, last_nrs[8]; const unsigned long epg = nilfs_palloc_entries_per_group(inode); - const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block; - unsigned entry_start, end, pos; + const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block; + unsigned int entry_start, end, pos; spinlock_t *lock; int i, j, k, ret; u32 nfree; @@ -774,7 +773,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu\n", + "entry number %llu already freed: ino=%lu", (unsigned long long)entry_nrs[j], (unsigned long)inode->i_ino); } else { @@ -819,7 +818,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) last_nrs[k]); if (ret && ret != -ENOENT) { nilfs_warning(inode->i_sb, __func__, - "failed to delete block of entry %llu: ino=%lu, err=%d\n", + "failed to delete block of entry %llu: ino=%lu, err=%d", (unsigned long long)last_nrs[k], (unsigned long)inode->i_ino, ret); } @@ -838,7 +837,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) ret = nilfs_palloc_delete_bitmap_block(inode, group); if (ret && ret != -ENOENT) { nilfs_warning(inode->i_sb, __func__, - "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n", + "failed to delete bitmap block of group %lu: ino=%lu, err=%d", group, (unsigned long)inode->i_ino, ret); } diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 6e6f49aa53df..05149e606a78 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -13,13 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Original code was written by Koji Sato <koji@osrg.net>. - * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, - * Amagai Yoshiji <amagai@osrg.net>. + * Originally written by Koji Sato. + * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. */ #ifndef _NILFS_ALLOC_H @@ -42,7 +37,7 @@ nilfs_palloc_entries_per_group(const struct inode *inode) return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); } -int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_init_blockgroup(struct inode *, unsigned int); int nilfs_palloc_get_entry_block(struct inode *, __u64, int, struct buffer_head **); void *nilfs_palloc_block_get_entry(const struct inode *, __u64, diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 27f75bcbeb30..f2a7877e0c8c 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/fs.h> @@ -46,7 +42,7 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, if (err == -EINVAL) { nilfs_error(inode->i_sb, fname, - "broken bmap (inode number=%lu)\n", inode->i_ino); + "broken bmap (inode number=%lu)", inode->i_ino); err = -EIO; } return err; @@ -97,7 +93,7 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, } int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, - unsigned maxblocks) + unsigned int maxblocks) { int ret; @@ -458,7 +454,7 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, struct buffer_head *pbh; __u64 key; - key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - + key = page_index(bh->b_page) << (PAGE_SHIFT - bmap->b_inode->i_blkbits); for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page) key++; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index bfa817ce40b3..b6a4c8f93ac8 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_BMAP_H @@ -61,7 +57,7 @@ struct nilfs_bmap_stats { struct nilfs_bmap_operations { int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *, - unsigned); + unsigned int); int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); int (*bop_delete)(struct nilfs_bmap *, __u64); void (*bop_clear)(struct nilfs_bmap *); @@ -126,10 +122,14 @@ struct nilfs_bmap { /* pointer type */ #define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. LBN) */ -#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single - version) */ -#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple - versions) */ +#define NILFS_BMAP_PTR_VS 1 /* + * virtual block number (single + * version) + */ +#define NILFS_BMAP_PTR_VM 2 /* + * virtual block number (has multiple + * versions) + */ #define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */ #define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0) @@ -154,7 +154,7 @@ struct nilfs_bmap_store { int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); -int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); +int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned int); int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec); int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key); int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index a35ae35e6932..0576033699bc 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -13,13 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * This file was originally written by Seiji Kihara <kihara@osrg.net> - * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for - * stabilization and simplification. + * Originally written by Seiji Kihara. + * Fully revised by Ryusuke Konishi for stabilization and simplification. * */ @@ -62,7 +57,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) set_buffer_uptodate(bh); unlock_page(bh->b_page); - page_cache_release(bh->b_page); + put_page(bh->b_page); return bh; } @@ -128,7 +123,7 @@ found: out_locked: unlock_page(page); - page_cache_release(page); + put_page(page); return err; } @@ -146,7 +141,7 @@ void nilfs_btnode_delete(struct buffer_head *bh) pgoff_t index = page_index(page); int still_dirty; - page_cache_get(page); + get_page(page); lock_page(page); wait_on_page_writeback(page); @@ -154,7 +149,7 @@ void nilfs_btnode_delete(struct buffer_head *bh) still_dirty = PageDirty(page); mapping = page->mapping; unlock_page(page); - page_cache_release(page); + put_page(page); if (!still_dirty && mapping) invalidate_inode_pages2_range(mapping, index, index); @@ -181,7 +176,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, obh = ctxt->bh; ctxt->newbh = NULL; - if (inode->i_blkbits == PAGE_CACHE_SHIFT) { + if (inode->i_blkbits == PAGE_SHIFT) { lock_page(obh->b_page); /* * We cannot call radix_tree_preload for the kernels older diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index d876b565ce64..2cc1b80e18f7 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -13,12 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Seiji Kihara <kihara@osrg.net> - * Revised by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Seiji Kihara. + * Revised by Ryusuke Konishi. */ #ifndef _NILFS_BTNODE_H diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 3a3821b00486..eccb1c89ccbb 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/slab.h> @@ -689,7 +685,8 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *btree, } static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, - __u64 key, __u64 *ptrp, unsigned maxblocks) + __u64 key, __u64 *ptrp, + unsigned int maxblocks) { struct nilfs_btree_path *path; struct nilfs_btree_node *node; @@ -1032,12 +1029,12 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree, if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; - else { - ptr = nilfs_btree_find_near(btree, path); - if (ptr != NILFS_BMAP_INVALID_PTR) - /* near */ - return ptr; - } + + ptr = nilfs_btree_find_near(btree, path); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* near */ + return ptr; + /* block group */ return nilfs_bmap_find_target_in_group(btree); } diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 22c02e35b6ef..df1a25faa83b 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_BTREE_H diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index b6596cab9e99..8a3d3b65af3f 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/kernel.h> @@ -41,6 +37,7 @@ static unsigned long nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) { __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); return (unsigned long)tcno; } @@ -50,6 +47,7 @@ static unsigned long nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) { __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); } @@ -433,7 +431,8 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, } static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, - void *buf, unsigned cisz, size_t nci) + void *buf, unsigned int cisz, + size_t nci) { struct nilfs_checkpoint *cp; struct nilfs_cpinfo *ci = buf; @@ -484,7 +483,8 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, } static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, - void *buf, unsigned cisz, size_t nci) + void *buf, unsigned int cisz, + size_t nci) { struct buffer_head *bh; struct nilfs_cpfile_header *header; @@ -570,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, */ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, - void *buf, unsigned cisz, size_t nci) + void *buf, unsigned int cisz, size_t nci) { switch (mode) { case NILFS_CHECKPOINT: @@ -870,8 +870,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) void *kaddr; int ret; - /* CP number is invalid if it's zero or larger than the - largest exist one.*/ + /* + * CP number is invalid if it's zero or larger than the + * largest existing one. + */ if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) return -ENOENT; down_read(&NILFS_MDT(cpfile)->mi_sem); diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index a242b9a314f9..0249744ae234 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_CPFILE_H @@ -37,8 +33,8 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); int nilfs_cpfile_is_snapshot(struct inode *, __u64); int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); -ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, - size_t); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, + unsigned int, size_t); int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 7dc23f100e57..7367610ea807 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/types.h> @@ -428,7 +424,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) return ret; } -ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz, +ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, size_t nvi) { struct buffer_head *entry_bh; diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index cbd8e9732503..abbfdabcabea 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_DAT_H @@ -51,7 +47,7 @@ void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, int nilfs_dat_mark_dirty(struct inode *, __u64); int nilfs_dat_freev(struct inode *, __u64 *, size_t); int nilfs_dat_move(struct inode *, __u64, sector_t); -ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned int, size_t); int nilfs_dat_read(struct super_block *sb, size_t entry_size, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 6b8b92b19cec..e506f4f7120a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net> + * Modified for NILFS by Amagai Yoshiji. */ /* * linux/fs/ext2/dir.c @@ -50,7 +46,7 @@ * nilfs uses block-sized chunks. Arguably, sector-sized ones would be * more robust, but we have what we have */ -static inline unsigned nilfs_chunk_size(struct inode *inode) +static inline unsigned int nilfs_chunk_size(struct inode *inode) { return inode->i_sb->s_blocksize; } @@ -58,37 +54,39 @@ static inline unsigned nilfs_chunk_size(struct inode *inode) static inline void nilfs_put_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. */ -static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr) { - unsigned last_byte = inode->i_size; + unsigned int last_byte = inode->i_size; - last_byte -= page_nr << PAGE_CACHE_SHIFT; - if (last_byte > PAGE_CACHE_SIZE) - last_byte = PAGE_CACHE_SIZE; + last_byte -= page_nr << PAGE_SHIFT; + if (last_byte > PAGE_SIZE) + last_byte = PAGE_SIZE; return last_byte; } -static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to) +static int nilfs_prepare_chunk(struct page *page, unsigned int from, + unsigned int to) { loff_t pos = page_offset(page) + from; + return __block_write_begin(page, pos, to - from, nilfs_get_block); } static void nilfs_commit_chunk(struct page *page, struct address_space *mapping, - unsigned from, unsigned to) + unsigned int from, unsigned int to) { struct inode *dir = mapping->host; loff_t pos = page_offset(page) + from; - unsigned len = to - from; - unsigned nr_dirty, copied; + unsigned int len = to - from; + unsigned int nr_dirty, copied; int err; nr_dirty = nilfs_page_count_clean_buffers(page, from, to); @@ -102,19 +100,19 @@ static void nilfs_commit_chunk(struct page *page, unlock_page(page); } -static void nilfs_check_page(struct page *page) +static bool nilfs_check_page(struct page *page) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; - unsigned chunk_size = nilfs_chunk_size(dir); + unsigned int chunk_size = nilfs_chunk_size(dir); char *kaddr = page_address(page); - unsigned offs, rec_len; - unsigned limit = PAGE_CACHE_SIZE; + unsigned int offs, rec_len; + unsigned int limit = PAGE_SIZE; struct nilfs_dir_entry *p; char *error; - if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_CACHE_MASK; + if ((dir->i_size >> PAGE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_MASK; if (limit & (chunk_size - 1)) goto Ebadsize; if (!limit) @@ -137,7 +135,7 @@ static void nilfs_check_page(struct page *page) goto Eend; out: SetPageChecked(page); - return; + return true; /* Too bad, we had an error */ @@ -161,7 +159,7 @@ Espan: bad_entry: nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, (unsigned long) le64_to_cpu(p->inode), rec_len, p->name_len); goto fail; @@ -170,11 +168,11 @@ Eend: nilfs_error(sb, "nilfs_check_page", "entry in directory #%lu spans the page boundary" "offset=%lu, inode=%lu", - dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + dir->i_ino, (page->index<<PAGE_SHIFT)+offs, (unsigned long) le64_to_cpu(p->inode)); fail: - SetPageChecked(page); SetPageError(page); + return false; } static struct page *nilfs_get_page(struct inode *dir, unsigned long n) @@ -184,10 +182,10 @@ static struct page *nilfs_get_page(struct inode *dir, unsigned long n) if (!IS_ERR(page)) { kmap(page); - if (!PageChecked(page)) - nilfs_check_page(page); - if (PageError(page)) - goto fail; + if (unlikely(!PageChecked(page))) { + if (PageError(page) || !nilfs_check_page(page)) + goto fail; + } } return page; @@ -256,10 +254,9 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) loff_t pos = ctx->pos; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - unsigned int offset = pos & ~PAGE_CACHE_MASK; - unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned int offset = pos & ~PAGE_MASK; + unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); -/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) return 0; @@ -272,7 +269,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) if (IS_ERR(page)) { nilfs_error(sb, __func__, "bad page in #%lu", inode->i_ino); - ctx->pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_SIZE - offset; return -EIO; } kaddr = page_address(page); @@ -321,7 +318,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, { const unsigned char *name = qstr->name; int namelen = qstr->len; - unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); struct page *page = NULL; @@ -340,6 +337,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, n = start; do { char *kaddr; + page = nilfs_get_page(dir, n); if (!IS_ERR(page)) { kaddr = page_address(page); @@ -361,7 +359,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, if (++n >= npages) n = 0; /* next page is past the blocks we've got */ - if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { nilfs_error(dir->i_sb, __func__, "dir %lu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, @@ -401,7 +399,7 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) if (de) { res = le64_to_cpu(de->inode); kunmap(page); - page_cache_release(page); + put_page(page); } return res; } @@ -410,8 +408,8 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct page *page, struct inode *inode) { - unsigned from = (char *) de - (char *) page_address(page); - unsigned to = from + nilfs_rec_len_from_disk(de->rec_len); + unsigned int from = (char *)de - (char *)page_address(page); + unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len); struct address_space *mapping = page->mapping; int err; @@ -433,15 +431,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned chunk_size = nilfs_chunk_size(dir); - unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned int chunk_size = nilfs_chunk_size(dir); + unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned short rec_len, name_len; struct page *page = NULL; struct nilfs_dir_entry *de; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; - unsigned from, to; + unsigned int from, to; int err; /* @@ -460,7 +458,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) kaddr = page_address(page); dir_end = kaddr + nilfs_last_byte(dir, n); de = (struct nilfs_dir_entry *)kaddr; - kaddr += PAGE_CACHE_SIZE - reclen; + kaddr += PAGE_SIZE - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ @@ -533,13 +531,14 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; char *kaddr = page_address(page); - unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); - unsigned to = ((char *)dir - kaddr) + - nilfs_rec_len_from_disk(dir->rec_len); - struct nilfs_dir_entry *pde = NULL; - struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); + unsigned int from, to; + struct nilfs_dir_entry *de, *pde = NULL; int err; + from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); + to = ((char *)dir - kaddr) + nilfs_rec_len_from_disk(dir->rec_len); + de = (struct nilfs_dir_entry *)(kaddr + from); + while ((char *)de < (char *)dir) { if (de->rec_len == 0) { nilfs_error(inode->i_sb, __func__, @@ -572,7 +571,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) { struct address_space *mapping = inode->i_mapping; struct page *page = grab_cache_page(mapping, 0); - unsigned chunk_size = nilfs_chunk_size(inode); + unsigned int chunk_size = nilfs_chunk_size(inode); struct nilfs_dir_entry *de; int err; void *kaddr; @@ -603,7 +602,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) kunmap_atomic(kaddr); nilfs_commit_chunk(page, mapping, 0, chunk_size); fail: - page_cache_release(page); + put_page(page); return err; } @@ -630,8 +629,8 @@ int nilfs_empty_dir(struct inode *inode) while ((char *)de <= kaddr) { if (de->rec_len == 0) { nilfs_error(inode->i_sb, __func__, - "zero-length directory entry " - "(kaddr=%p, de=%p)\n", kaddr, de); + "zero-length directory entry (kaddr=%p, de=%p)", + kaddr, de); goto not_empty; } if (de->inode != 0) { @@ -661,7 +660,7 @@ not_empty: const struct file_operations nilfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = nilfs_readdir, + .iterate_shared = nilfs_readdir, .unlocked_ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = nilfs_compat_ioctl, diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index ebf89fd8ac1a..251a44928405 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/errno.h> @@ -62,7 +58,7 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *direct, static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, __u64 key, __u64 *ptrp, - unsigned maxblocks) + unsigned int maxblocks) { struct inode *dat = NULL; __u64 ptr, ptr2; @@ -83,7 +79,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, ptr = blocknr; } - maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1); + maxblocks = min_t(unsigned int, maxblocks, + NILFS_DIRECT_KEY_MAX - key + 1); for (cnt = 1; cnt < maxblocks && (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) != NILFS_BMAP_INVALID_PTR; @@ -110,9 +107,9 @@ nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key) if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; - else - /* block group */ - return nilfs_bmap_find_target_in_group(direct); + + /* block group */ + return nilfs_bmap_find_target_in_group(direct); } static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index dc643de20a25..3015a6e78724 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_DIRECT_H diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h index 19ccbf9522ab..00107fdb9343 100644 --- a/fs/nilfs2/export.h +++ b/fs/nilfs2/export.h @@ -20,6 +20,6 @@ struct nilfs_fid { u32 parent_gen; u64 parent_ino; -} __attribute__ ((packed)); +} __packed; #endif diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 088ba001c6ef..547381f3ce13 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Amagai Yoshiji <amagai@osrg.net>, - * Ryusuke Konishi <ryusuke@osrg.net> + * Written by Amagai Yoshiji and Ryusuke Konishi. */ #include <linux/fs.h> diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 748ca238915a..693aded72498 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -13,13 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>, - * and Ryusuke Konishi <ryusuke@osrg.net>. - * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * Written by Seiji Kihara, Amagai Yoshiji, and Ryusuke Konishi. + * Revised by Ryusuke Konishi. * */ /* @@ -115,7 +110,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, failed: unlock_page(bh->b_page); - page_cache_release(bh->b_page); + put_page(bh->b_page); return err; } diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 6548c7851b48..1d2b1805327a 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -13,12 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Amagai Yoshiji <amagai@osrg.net>. - * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * Written by Amagai Yoshiji. + * Revised by Ryusuke Konishi. * */ @@ -68,8 +64,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct nilfs_palloc_req req; int ret; - req.pr_entry_nr = 0; /* 0 says find free inode from beginning of - a group. dull code!! */ + req.pr_entry_nr = 0; /* + * 0 says find free inode from beginning + * of a group. dull code!! + */ req.pr_entry_bh = NULL; ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 679674d13372..23ad2f091e76 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -13,12 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Amagai Yoshiji <amagai@osrg.net> - * Revised by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Amagai Yoshiji. + * Revised by Ryusuke Konishi. * */ @@ -36,6 +32,7 @@ static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) { void *kaddr = kmap(ibh->b_page); + return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 21a1e2e0d92f..a0ebdb17e912 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ @@ -87,7 +83,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 blknum = 0; int err = 0, ret; - unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; + unsigned int maxblocks = bh_result->b_size >> inode->i_blkbits; down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); @@ -133,11 +129,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, /* Error handling should be detailed */ set_buffer_new(bh_result); set_buffer_delay(bh_result); - map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed - to proper value */ + map_bh(bh_result, inode->i_sb, 0); + /* Disk block number must be changed to proper value */ + } else if (ret == -ENOENT) { - /* not found is not error (e.g. hole); must return without - the mapped state flag. */ + /* + * not found is not error (e.g. hole); must return without + * the mapped state flag. + */ ; } else { err = ret; @@ -167,7 +166,7 @@ static int nilfs_readpage(struct file *file, struct page *page) * @nr_pages - number of pages to be read */ static int nilfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct list_head *pages, unsigned int nr_pages) { return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); } @@ -226,7 +225,7 @@ static int nilfs_set_page_dirty(struct page *page) int ret = __set_page_dirty_nobuffers(page); if (page_has_buffers(page)) { - unsigned nr_dirty = 0; + unsigned int nr_dirty = 0; struct buffer_head *bh, *head; /* @@ -249,7 +248,7 @@ static int nilfs_set_page_dirty(struct page *page) if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); } else if (ret) { - unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); + unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); nilfs_set_file_dirty(inode, nr_dirty); } @@ -291,8 +290,8 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - unsigned nr_dirty; + unsigned int start = pos & (PAGE_SIZE - 1); + unsigned int nr_dirty; int err; nr_dirty = nilfs_page_count_clean_buffers(page, start, @@ -305,7 +304,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, } static ssize_t -nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); @@ -313,7 +312,7 @@ nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) return 0; /* Needs synchronization with the cleaner */ - return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); + return blockdev_direct_IO(iocb, inode, iter, nilfs_get_block); } const struct address_space_operations nilfs_aops = { @@ -399,23 +398,26 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) err = nilfs_init_acl(inode, dir); if (unlikely(err)) - goto failed_after_creation; /* never occur. When supporting - nilfs_init_acl(), proper cancellation of - above jobs should be considered */ + /* + * Never occur. When supporting nilfs_init_acl(), + * proper cancellation of above jobs should be considered. + */ + goto failed_after_creation; return inode; failed_after_creation: clear_nlink(inode); unlock_new_inode(inode); - iput(inode); /* raw_inode will be deleted through - nilfs_evict_inode() */ + iput(inode); /* + * raw_inode will be deleted through + * nilfs_evict_inode(). + */ goto failed; failed_ifile_create_inode: make_bad_inode(inode); - iput(inode); /* if i_nlink == 1, generic_forget_inode() will be - called */ + iput(inode); failed: return ERR_PTR(err); } @@ -666,8 +668,10 @@ void nilfs_write_inode_common(struct inode *inode, else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_device_code = cpu_to_le64(huge_encode_dev(inode->i_rdev)); - /* When extending inode, nilfs->ns_inode_size should be checked - for substitutions of appended fields */ + /* + * When extending inode, nilfs->ns_inode_size should be checked + * for substitutions of appended fields. + */ } void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) @@ -685,9 +689,12 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) set_bit(NILFS_I_INODE_SYNC, &ii->i_state); nilfs_write_inode_common(inode, raw_inode, 0); - /* XXX: call with has_bmap = 0 is a workaround to avoid - deadlock of bmap. This delays update of i_bmap to just - before writing */ + /* + * XXX: call with has_bmap = 0 is a workaround to avoid + * deadlock of bmap. This delays update of i_bmap to just + * before writing. + */ + nilfs_ifile_unmap_inode(ifile, ino, ibh); } @@ -752,14 +759,15 @@ void nilfs_truncate(struct inode *inode) nilfs_mark_inode_dirty(inode); nilfs_set_file_dirty(inode, 0); nilfs_transaction_commit(sb); - /* May construct a logical segment and may fail in sync mode. - But truncate has no return value. */ + /* + * May construct a logical segment and may fail in sync mode. + * But truncate has no return value. + */ } static void nilfs_clear_inode(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); - struct nilfs_mdt_info *mdi = NILFS_MDT(inode); /* * Free resources allocated in nilfs_read_inode(), here. @@ -768,8 +776,8 @@ static void nilfs_clear_inode(struct inode *inode) brelse(ii->i_bh); ii->i_bh = NULL; - if (mdi && mdi->mi_palloc_cache) - nilfs_palloc_destroy_cache(inode); + if (nilfs_is_metadata_file_inode(inode)) + nilfs_mdt_clear(inode); if (test_bit(NILFS_I_BMAP, &ii->i_state)) nilfs_bmap_clear(ii->i_bmap); @@ -811,8 +819,10 @@ void nilfs_evict_inode(struct inode *inode) if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); nilfs_transaction_commit(sb); - /* May construct a logical segment and may fail in sync mode. - But delete_inode has no return value. */ + /* + * May construct a logical segment and may fail in sync mode. + * But delete_inode has no return value. + */ } int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) @@ -856,6 +866,7 @@ out_err: int nilfs_permission(struct inode *inode, int mask) { struct nilfs_root *root = NILFS_I(inode)->i_root; + if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ @@ -906,7 +917,7 @@ int nilfs_inode_dirty(struct inode *inode) return ret; } -int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) +int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) { struct nilfs_inode_info *ii = NILFS_I(inode); struct the_nilfs *nilfs = inode->i_sb->s_fs_info; @@ -919,17 +930,23 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) spin_lock(&nilfs->ns_inode_lock); if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && !test_bit(NILFS_I_BUSY, &ii->i_state)) { - /* Because this routine may race with nilfs_dispose_list(), - we have to check NILFS_I_QUEUED here, too. */ + /* + * Because this routine may race with nilfs_dispose_list(), + * we have to check NILFS_I_QUEUED here, too. + */ if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { - /* This will happen when somebody is freeing - this inode. */ + /* + * This will happen when somebody is freeing + * this inode. + */ nilfs_warning(inode->i_sb, __func__, - "cannot get inode (ino=%lu)\n", + "cannot get inode (ino=%lu)", inode->i_ino); spin_unlock(&nilfs->ns_inode_lock); - return -EINVAL; /* NILFS_I_DIRTY may remain for - freeing inode */ + return -EINVAL; /* + * NILFS_I_DIRTY may remain for + * freeing inode. + */ } list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); set_bit(NILFS_I_QUEUED, &ii->i_state); @@ -946,7 +963,7 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags) err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { nilfs_warning(inode->i_sb, __func__, - "failed to reget inode block.\n"); + "failed to reget inode block."); return err; } nilfs_update_inode(inode, ibh, flags); @@ -973,7 +990,7 @@ void nilfs_dirty_inode(struct inode *inode, int flags) if (is_bad_inode(inode)) { nilfs_warning(inode->i_sb, __func__, - "tried to mark bad_inode dirty. ignored.\n"); + "tried to mark bad_inode dirty. ignored."); dump_stack(); return; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index e8fe24882b5b..358b57e2cdf9 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/fs.h> @@ -783,6 +779,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, size_t nmembs = argv->v_nmembs; struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; struct nilfs_bdesc *bdescs = buf; + struct buffer_head *bh; int ret, i; for (i = 0; i < nmembs; i++) { @@ -800,12 +797,16 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, /* skip dead block */ continue; if (bdescs[i].bd_level == 0) { - ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat, - bdescs[i].bd_offset); - if (ret < 0) { + ret = nilfs_mdt_get_block(nilfs->ns_dat, + bdescs[i].bd_offset, + false, NULL, &bh); + if (unlikely(ret)) { WARN_ON(ret == -ENOENT); return ret; } + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(nilfs->ns_dat); + put_bh(bh); } else { ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset, bdescs[i].bd_level); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 1125f40233ff..3417d859a03c 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. */ #include <linux/buffer_head.h> @@ -32,6 +28,7 @@ #include "segment.h" #include "page.h" #include "mdt.h" +#include "alloc.h" /* nilfs_palloc_destroy_cache() */ #include <trace/events/nilfs2.h> @@ -110,7 +107,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, failed_bh: unlock_page(bh->b_page); - page_cache_release(bh->b_page); + put_page(bh->b_page); brelse(bh); failed_unlock: @@ -170,7 +167,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, failed_bh: unlock_page(bh->b_page); - page_cache_release(bh->b_page); + put_page(bh->b_page); brelse(bh); failed: return ret; @@ -363,7 +360,7 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) { pgoff_t index = (pgoff_t)block >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); + (PAGE_SHIFT - inode->i_blkbits); struct page *page; unsigned long first_block; int ret = 0; @@ -376,7 +373,7 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) wait_on_page_writeback(page); first_block = (unsigned long)index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); + (PAGE_SHIFT - inode->i_blkbits); if (page_has_buffers(page)) { struct buffer_head *bh; @@ -385,7 +382,7 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) } still_dirty = PageDirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); if (still_dirty || invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) @@ -393,34 +390,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) return ret; } -/** - * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. - * @inode: inode of the meta data file - * @block: block offset - * - * Return Value: On success, it returns 0. On error, the following negative - * error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - the specified block does not exist (hole block) - */ -int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) -{ - struct buffer_head *bh; - int err; - - err = nilfs_mdt_read_block(inode, block, 0, &bh); - if (unlikely(err)) - return err; - mark_buffer_dirty(bh); - nilfs_mdt_mark_dirty(inode); - brelse(bh); - return 0; -} - int nilfs_mdt_fetch_dirty(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); @@ -497,8 +466,32 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) return 0; } -void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, - unsigned header_size) +/** + * nilfs_mdt_clear - do cleanup for the metadata file + * @inode: inode of the metadata file + */ +void nilfs_mdt_clear(struct inode *inode) +{ + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + if (mdi->mi_palloc_cache) + nilfs_palloc_destroy_cache(inode); +} + +/** + * nilfs_mdt_destroy - release resources used by the metadata file + * @inode: inode of the metadata file + */ +void nilfs_mdt_destroy(struct inode *inode) +{ + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ + kfree(mdi); +} + +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size, + unsigned int header_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); @@ -578,7 +571,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) } unlock_page(page); - page_cache_release(page); + put_page(page); return 0; } @@ -597,7 +590,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) bh_frozen = nilfs_page_get_nth_block(page, n); } unlock_page(page); - page_cache_release(page); + put_page(page); } return bh_frozen; } diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index 03246cac3338..3f67f3932097 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. */ #ifndef _NILFS_MDT_H @@ -57,8 +53,8 @@ struct nilfs_shadow_map { struct nilfs_mdt_info { struct rw_semaphore mi_sem; struct blockgroup_lock *mi_bgl; - unsigned mi_entry_size; - unsigned mi_first_entry_offset; + unsigned int mi_entry_size; + unsigned int mi_first_entry_offset; unsigned long mi_entries_per_block; struct nilfs_palloc_cache *mi_palloc_cache; struct nilfs_shadow_map *mi_shadow; @@ -71,6 +67,11 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) return inode->i_private; } +static inline int nilfs_is_metadata_file_inode(const struct inode *inode) +{ + return inode->i_private != NULL; +} + /* Default GFP flags using highmem */ #define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM) @@ -83,11 +84,13 @@ int nilfs_mdt_find_block(struct inode *inode, unsigned long start, struct buffer_head **out_bh); int nilfs_mdt_delete_block(struct inode *, unsigned long); int nilfs_mdt_forget_block(struct inode *, unsigned long); -int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); int nilfs_mdt_fetch_dirty(struct inode *); int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz); -void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); +void nilfs_mdt_clear(struct inode *inode); +void nilfs_mdt_destroy(struct inode *inode); + +void nilfs_mdt_set_entry_size(struct inode *, unsigned int, unsigned int); int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 7ccdb961eea9..1ec8ae5995a5 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>, - * Ryusuke Konishi <ryusuke@osrg.net> + * Modified for NILFS by Amagai Yoshiji and Ryusuke Konishi. */ /* * linux/fs/ext2/namei.c @@ -49,6 +44,7 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) { int err = nilfs_add_link(dentry, inode); + if (!err) { d_instantiate(dentry, inode); unlock_new_inode(inode); @@ -143,7 +139,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, { struct nilfs_transaction_info ti; struct super_block *sb = dir->i_sb; - unsigned l = strlen(symname)+1; + unsigned int l = strlen(symname) + 1; struct inode *inode; int err; @@ -288,7 +284,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) { nilfs_warning(inode->i_sb, __func__, - "deleting nonexistent file (%lu), %d\n", + "deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } @@ -431,11 +427,11 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, out_dir: if (dir_de) { kunmap(dir_page); - page_cache_release(dir_page); + put_page(dir_page); } out_old: kunmap(old_page); - page_cache_release(old_page); + put_page(old_page); out: nilfs_transaction_abort(old_dir->i_sb); return err; @@ -457,7 +453,7 @@ static struct dentry *nilfs_get_parent(struct dentry *child) root = NILFS_I(d_inode(child))->i_root; - inode = nilfs_iget(d_inode(child)->i_sb, root, ino); + inode = nilfs_iget(child->d_sb, root, ino); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 385704027575..b1d48bc0532d 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net> - * Ryusuke Konishi <ryusuke@osrg.net> + * Written by Koji Sato and Ryusuke Konishi. */ #ifndef _NILFS_H @@ -69,8 +64,10 @@ struct nilfs_inode_info { */ struct rw_semaphore xattr_sem; #endif - struct buffer_head *i_bh; /* i_bh contains a new or dirty - disk inode */ + struct buffer_head *i_bh; /* + * i_bh contains a new or dirty + * disk inode. + */ struct nilfs_root *i_root; struct inode vfs_inode; }; @@ -100,8 +97,10 @@ enum { NILFS_I_NEW = 0, /* Inode is newly created */ NILFS_I_DIRTY, /* The file is dirty */ NILFS_I_QUEUED, /* inode is in dirty_files list */ - NILFS_I_BUSY, /* inode is grabbed by a segment - constructor */ + NILFS_I_BUSY, /* + * Inode is grabbed by a segment + * constructor + */ NILFS_I_COLLECTED, /* All dirty blocks are collected */ NILFS_I_UPDATED, /* The file has been written back */ NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ @@ -145,8 +144,10 @@ enum { struct nilfs_transaction_info { u32 ti_magic; void *ti_save; - /* This should never used. If this happens, - one of other filesystems has a bug. */ + /* + * This should never be used. If it happens, + * one of other filesystems has a bug. + */ unsigned short ti_flags; unsigned short ti_count; }; @@ -156,8 +157,10 @@ struct nilfs_transaction_info { /* ti_flags */ #define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ -#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the - end of transaction. */ +#define NILFS_TI_SYNC 0x0002 /* + * Force to construct segment at the + * end of transaction. + */ #define NILFS_TI_GC 0x0004 /* GC context */ #define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ #define NILFS_TI_WRITER 0x0010 /* Constructor context */ @@ -279,7 +282,7 @@ extern void nilfs_write_failed(struct address_space *mapping, loff_t to); int nilfs_permission(struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); -int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); +int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty); extern int __nilfs_mark_inode_dirty(struct inode *, int); extern void nilfs_dirty_inode(struct inode *, int flags); int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index c20df77eff99..d97ba5f11b77 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net>, - * Seiji Kihara <kihara@osrg.net>. + * Written by Ryusuke Konishi and Seiji Kihara. */ #include <linux/pagemap.h> @@ -50,7 +45,7 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, if (!page_has_buffers(page)) create_empty_buffers(page, 1 << blkbits, b_state); - first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits); + first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); bh = nilfs_page_get_nth_block(page, block - first_block); touch_buffer(bh); @@ -64,7 +59,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, unsigned long b_state) { int blkbits = inode->i_blkbits; - pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); + pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits); struct page *page; struct buffer_head *bh; @@ -75,7 +70,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); if (unlikely(!bh)) { unlock_page(page); - page_cache_release(page); + put_page(page); return NULL; } return bh; @@ -288,7 +283,7 @@ repeat: __set_page_dirty_nobuffers(dpage); unlock_page(dpage); - page_cache_release(dpage); + put_page(dpage); unlock_page(page); } pagevec_release(&pvec); @@ -333,7 +328,7 @@ repeat: WARN_ON(PageDirty(dpage)); nilfs_copy_page(dpage, page, 0); unlock_page(dpage); - page_cache_release(dpage); + put_page(dpage); } else { struct page *page2; @@ -350,7 +345,7 @@ repeat: if (unlikely(err < 0)) { WARN_ON(err == -EEXIST); page->mapping = NULL; - page_cache_release(page); /* for cache */ + put_page(page); /* for cache */ } else { page->mapping = dmap; dmap->nrpages++; @@ -440,12 +435,12 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) __nilfs_clear_page_dirty(page); } -unsigned nilfs_page_count_clean_buffers(struct page *page, - unsigned from, unsigned to) +unsigned int nilfs_page_count_clean_buffers(struct page *page, + unsigned int from, unsigned int to) { - unsigned block_start, block_end; + unsigned int block_start, block_end; struct buffer_head *bh, *head; - unsigned nc = 0; + unsigned int nc = 0; for (bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; @@ -523,8 +518,8 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, if (inode->i_mapping->nrpages == 0) return 0; - index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits); + index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); + nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); pagevec_init(&pvec, 0); @@ -537,7 +532,7 @@ repeat: if (length > 0 && pvec.pages[0]->index > index) goto out; - b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits); i = 0; do { page = pvec.pages[i]; diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index a43b8287d012..f3687c958fa8 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net>, - * Seiji Kihara <kihara@osrg.net>. + * Written by Ryusuke Konishi and Seiji Kihara. */ #ifndef _NILFS_PAGE_H @@ -58,7 +53,8 @@ void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); -unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); +unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int, + unsigned int); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 9b4f205d1173..d893dc912b62 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. */ #include <linux/buffer_head.h> @@ -47,8 +43,10 @@ enum { /* work structure for recovery */ struct nilfs_recovery_block { - ino_t ino; /* Inode number of the file that this block - belongs to */ + ino_t ino; /* + * Inode number of the file that this block + * belongs to + */ sector_t blocknr; /* block number */ __u64 vblocknr; /* virtual block number */ unsigned long blkoff; /* File offset of the data block (per block) */ @@ -156,7 +154,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, sr = (struct nilfs_super_root *)bh_sr->b_data; if (check) { - unsigned bytes = le16_to_cpu(sr->sr_bytes); + unsigned int bytes = le16_to_cpu(sr->sr_bytes); if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; @@ -508,7 +506,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, { struct inode *inode; struct nilfs_recovery_block *rb, *n; - unsigned blocksize = nilfs->ns_blocksize; + unsigned int blocksize = nilfs->ns_blocksize; struct page *page; loff_t pos; int err = 0, err2 = 0; @@ -526,6 +524,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, 0, &page, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; + if (pos + blocksize > isize) nilfs_write_failed(inode->i_mapping, pos + blocksize); @@ -544,14 +543,14 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, blocksize, page, NULL); unlock_page(page); - page_cache_release(page); + put_page(page); (*nr_salvaged_blocks)++; goto next; failed_page: unlock_page(page); - page_cache_release(page); + put_page(page); failed_inode: printk(KERN_WARNING @@ -872,9 +871,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, flags = le16_to_cpu(sum->ss_flags); if (!(flags & NILFS_SS_SR) && !scan_newer) { - /* This will never happen because a superblock - (last_segment) always points to a pseg - having a super root. */ + /* + * This will never happen because a superblock + * (last_segment) always points to a pseg with + * a super root. + */ ret = NILFS_SEG_FAIL_CONSISTENCY; goto failed; } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index f63620ce3892..bf36df10540b 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ @@ -133,7 +129,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, return 0; } -int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags, time_t ctime, __u64 cno) { int err; @@ -240,7 +236,7 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, { struct nilfs_super_root *raw_sr; struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; - unsigned srsize; + unsigned int srsize; u32 crc; raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index b04f08cc2397..7bbccc099709 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ #ifndef _NILFS_SEGBUF_H @@ -82,7 +78,7 @@ struct nilfs_segment_buffer { __u64 sb_nextnum; sector_t sb_fseg_start, sb_fseg_end; sector_t sb_pseg_start; - unsigned sb_rest_blocks; + unsigned int sb_rest_blocks; /* Buffers */ struct list_head sb_segsum_buffers; @@ -124,7 +120,8 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); -int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t, + __u64); int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, struct buffer_head **); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 3b65adaae7e4..e78b68a81aec 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ @@ -49,18 +45,26 @@ */ #define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ -#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments - appended in collection retry loop */ +#define SC_MAX_SEGDELTA 64 /* + * Upper limit of the number of segments + * appended in collection retry loop + */ /* Construction mode */ enum { SC_LSEG_SR = 1, /* Make a logical segment having a super root */ - SC_LSEG_DSYNC, /* Flush data blocks of a given file and make - a logical segment without a super root */ - SC_FLUSH_FILE, /* Flush data files, leads to segment writes without - creating a checkpoint */ - SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without - a checkpoint */ + SC_LSEG_DSYNC, /* + * Flush data blocks of a given file and make + * a logical segment without a super root. + */ + SC_FLUSH_FILE, /* + * Flush data files, leads to segment writes without + * creating a checkpoint. + */ + SC_FLUSH_DAT, /* + * Flush DAT file. This also creates segments + * without a checkpoint. + */ }; /* Stage numbers of dirty block collection */ @@ -154,17 +158,15 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) if (cur_ti) { if (cur_ti->ti_magic == NILFS_TI_MAGIC) return ++cur_ti->ti_count; - else { - /* - * If journal_info field is occupied by other FS, - * it is saved and will be restored on - * nilfs_transaction_commit(). - */ - printk(KERN_WARNING - "NILFS warning: journal info from a different " - "FS\n"); - save = current->journal_info; - } + + /* + * If journal_info field is occupied by other FS, + * it is saved and will be restored on + * nilfs_transaction_commit(). + */ + printk(KERN_WARNING + "NILFS warning: journal info from a different FS\n"); + save = current->journal_info; } if (!ti) { ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); @@ -397,10 +399,10 @@ static void nilfs_transaction_unlock(struct super_block *sb) static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, - unsigned bytes) + unsigned int bytes) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; - unsigned blocksize = sci->sc_super->s_blocksize; + unsigned int blocksize = sci->sc_super->s_blocksize; void *p; if (unlikely(ssp->offset + bytes > blocksize)) { @@ -422,8 +424,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; struct buffer_head *sumbh; - unsigned sumbytes; - unsigned flags = 0; + unsigned int sumbytes; + unsigned int flags = 0; int err; if (nilfs_doing_gc()) @@ -444,8 +446,10 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) { sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) - return -E2BIG; /* The current segment is filled up - (internal code) */ + return -E2BIG; /* + * The current segment is filled up + * (internal code) + */ sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); return nilfs_segctor_reset_segment_buffer(sci); } @@ -472,9 +476,9 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) */ static int nilfs_segctor_segsum_block_required( struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, - unsigned binfo_size) + unsigned int binfo_size) { - unsigned blocksize = sci->sc_super->s_blocksize; + unsigned int blocksize = sci->sc_super->s_blocksize; /* Size of finfo and binfo is enough small against blocksize */ return ssp->offset + binfo_size + @@ -533,7 +537,7 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode, - unsigned binfo_size) + unsigned int binfo_size) { struct nilfs_segment_buffer *segbuf; int required, err = 0; @@ -617,7 +621,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, *vblocknr = binfo->bi_v.bi_vblocknr; } -static struct nilfs_sc_operations nilfs_sc_file_ops = { +static const struct nilfs_sc_operations nilfs_sc_file_ops = { .collect_data = nilfs_collect_file_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_file_bmap, @@ -666,7 +670,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, *binfo_dat = binfo->bi_dat; } -static struct nilfs_sc_operations nilfs_sc_dat_ops = { +static const struct nilfs_sc_operations nilfs_sc_dat_ops = { .collect_data = nilfs_collect_dat_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_dat_bmap, @@ -674,7 +678,7 @@ static struct nilfs_sc_operations nilfs_sc_dat_ops = { .write_node_binfo = nilfs_write_dat_node_binfo, }; -static struct nilfs_sc_operations nilfs_sc_dsync_ops = { +static const struct nilfs_sc_operations nilfs_sc_dsync_ops = { .collect_data = nilfs_collect_file_data, .collect_node = NULL, .collect_bmap = NULL, @@ -777,7 +781,7 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs, { struct nilfs_inode_info *ii, *n; struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; - unsigned nv = 0; + unsigned int nv = 0; while (!list_empty(head)) { spin_lock(&nilfs->ns_inode_lock); @@ -875,9 +879,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, &raw_cp, &bh_cp); if (likely(!err)) { - /* The following code is duplicated with cpfile. But, it is - needed to collect the checkpoint even if it was not newly - created */ + /* + * The following code is duplicated with cpfile. But, it is + * needed to collect the checkpoint even if it was not newly + * created. + */ mark_buffer_dirty(bh_cp); nilfs_mdt_mark_dirty(nilfs->ns_cpfile); nilfs_cpfile_put_checkpoint( @@ -958,7 +964,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; - unsigned isz, srsz; + unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; raw_sr = (struct nilfs_super_root *)bh_sr->b_data; @@ -1043,7 +1049,7 @@ static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, struct inode *inode, - struct nilfs_sc_operations *sc_ops) + const struct nilfs_sc_operations *sc_ops) { LIST_HEAD(data_buffers); LIST_HEAD(node_buffers); @@ -1406,8 +1412,10 @@ static void nilfs_free_incomplete_logs(struct list_head *logs, if (atomic_read(&segbuf->sb_err)) { /* Case 1: The first segment failed */ if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) - /* Case 1a: Partial segment appended into an existing - segment */ + /* + * Case 1a: Partial segment appended into an existing + * segment + */ nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, segbuf->sb_fseg_end); else /* Case 1b: New full segment */ @@ -1550,7 +1558,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, sector_t blocknr; unsigned long nfinfo = segbuf->sb_sum.nfinfo; unsigned long nblocks = 0, ndatablk = 0; - struct nilfs_sc_operations *sc_op = NULL; + const struct nilfs_sc_operations *sc_op = NULL; struct nilfs_segsum_pointer ssp; struct nilfs_finfo *finfo = NULL; union nilfs_binfo binfo; @@ -1631,8 +1639,10 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) static void nilfs_begin_page_io(struct page *page) { if (!page || PageWriteback(page)) - /* For split b-tree node pages, this function may be called - twice. We ignore the 2nd or later calls by this check. */ + /* + * For split b-tree node pages, this function may be called + * twice. We ignore the 2nd or later calls by this check. + */ return; lock_page(page); @@ -1942,7 +1952,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, ifile, ii->vfs_inode.i_ino, &ibh); if (unlikely(err)) { nilfs_warning(sci->sc_super, __func__, - "failed to get inode block.\n"); + "failed to get inode block."); return err; } mark_buffer_dirty(ibh); @@ -2070,7 +2080,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) goto failed_to_write; if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE || - nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) { + nilfs->ns_blocksize_bits != PAGE_SHIFT) { /* * At this point, we avoid double buffering * for blocksize < pagesize because page dirty @@ -2395,6 +2405,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) static void nilfs_construction_timeout(unsigned long data) { struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); } @@ -2555,10 +2566,10 @@ static int nilfs_segctor_thread(void *arg) if (timeout || sci->sc_seq_request != sci->sc_seq_done) mode = SC_LSEG_SR; - else if (!sci->sc_flush_request) - break; - else + else if (sci->sc_flush_request) mode = nilfs_segctor_flush_mode(sci); + else + break; spin_unlock(&sci->sc_state_lock); nilfs_segctor_thread_construct(sci, mode); @@ -2684,8 +2695,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) { int ret, retrycount = NILFS_SC_CLEANUP_RETRY; - /* The segctord thread was stopped and its timer was removed. - But some tasks remain. */ + /* + * The segctord thread was stopped and its timer was removed. + * But some tasks remain. + */ do { struct nilfs_transaction_info ti; @@ -2727,13 +2740,13 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) if (!list_empty(&sci->sc_dirty_files)) { nilfs_warning(sci->sc_super, __func__, - "dirty file(s) after the final construction\n"); + "dirty file(s) after the final construction"); nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); } if (!list_empty(&sci->sc_iput_queue)) { nilfs_warning(sci->sc_super, __func__, - "iput queue is not empty\n"); + "iput queue is not empty"); nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1); } @@ -2810,7 +2823,7 @@ void nilfs_detach_log_writer(struct super_block *sb) if (!list_empty(&nilfs->ns_dirty_files)) { list_splice_init(&nilfs->ns_dirty_files, &garbage_list); nilfs_warning(sb, __func__, - "Hit dirty file after stopped log writer\n"); + "Hit dirty file after stopped log writer"); } spin_unlock(&nilfs->ns_inode_lock); up_write(&nilfs->ns_segctor_sem); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 0408b9b2814b..6565c10b7b76 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ #ifndef _NILFS_SEGMENT_H @@ -75,7 +71,7 @@ struct nilfs_recovery_info { */ struct nilfs_cstage { int scnt; - unsigned flags; + unsigned int flags; struct nilfs_inode_info *dirty_file_ptr; struct nilfs_inode_info *gc_inode_ptr; }; @@ -84,7 +80,7 @@ struct nilfs_segment_buffer; struct nilfs_segsum_pointer { struct buffer_head *bh; - unsigned offset; /* offset in bytes */ + unsigned int offset; /* offset in bytes */ }; /** @@ -193,11 +189,15 @@ enum { NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ NILFS_SC_UNCLOSED, /* Logical segment is not closed */ NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ - NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a - checkpoint */ - NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files - other than DAT, cpfile, sufile, or files - moved by GC */ + NILFS_SC_PRIOR_FLUSH, /* + * Requesting immediate flush without making a + * checkpoint + */ + NILFS_SC_HAVE_DELTA, /* + * Next checkpoint will have update of files + * other than DAT, cpfile, sufile, or files + * moved by GC. + */ }; /* sc_state */ @@ -207,17 +207,23 @@ enum { /* * Constant parameters */ -#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when - destroying segctord */ +#define NILFS_SC_CLEANUP_RETRY 3 /* + * Retry count of construction when + * destroying segctord + */ /* * Default values of timeout, in seconds. */ -#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. - It triggers construction of a - logical segment with a super root */ -#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root - creation */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* + * Timeout value of dirty blocks. + * It triggers construction of a + * logical segment with a super root. + */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* + * Maximum frequency of super root + * creation + */ /* * The default threshold amount of data, in block counts. diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 52821ffc11f4..1963595a1580 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -13,12 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. - * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * Written by Koji Sato. + * Revised by Ryusuke Konishi. */ #include <linux/kernel.h> @@ -61,6 +57,7 @@ static unsigned long nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); return (unsigned long)t; } @@ -69,6 +66,7 @@ static unsigned long nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); } @@ -819,7 +817,7 @@ out: * %-ENOMEM - Insufficient amount of memory available. */ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, - unsigned sisz, size_t nsi) + unsigned int sisz, size_t nsi) { struct buffer_head *su_bh; struct nilfs_segment_usage *su; @@ -897,7 +895,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, * %-EINVAL - Invalid values in input (segment number, flags or nblocks) */ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, - unsigned supsz, size_t nsup) + unsigned int supsz, size_t nsup) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh, *bh; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index b8afd72f2379..46e89872294c 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_SUFILE_H @@ -42,9 +38,9 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, unsigned long nblocks, time_t modtime); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); -ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int, size_t); -ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t); +ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned int, size_t); int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, void (*dofunc)(struct inode *, __u64, diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 7f5d3d9f1c37..666107a18a22 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. */ /* * linux/fs/ext2/super.c @@ -173,12 +169,10 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) static void nilfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - struct nilfs_mdt_info *mdi = NILFS_MDT(inode); - if (mdi) { - kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ - kfree(mdi); - } + if (nilfs_is_metadata_file_inode(inode)) + nilfs_mdt_destroy(inode); + kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } @@ -279,7 +273,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, } } else if (sbp[1] && sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { - memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); } if (flip && sbp[1]) @@ -749,6 +743,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; @@ -891,7 +886,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval); nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max); - return !parse_options(data, sb, 0) ? -EINVAL : 0 ; + return !parse_options(data, sb, 0) ? -EINVAL : 0; } int nilfs_check_feature_compatibility(struct super_block *sb, @@ -1316,7 +1311,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s->s_root) { - s_new = true; + s_new = true; /* New superblock instance created */ s->s_mode = mode; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index bbb0dcc35905..8ffa42b704d8 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -68,7 +68,7 @@ static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \ static const struct sysfs_ops nilfs_##name##_attr_ops = { \ .show = nilfs_##name##_attr_show, \ .store = nilfs_##name##_attr_store, \ -}; +} #define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ static void nilfs_##name##_attr_release(struct kobject *kobj) \ @@ -84,7 +84,7 @@ static struct kobj_type nilfs_##name##_ktype = { \ .default_attrs = nilfs_##name##_attrs, \ .sysfs_ops = &nilfs_##name##_attr_ops, \ .release = nilfs_##name##_attr_release, \ -}; +} #define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ @@ -756,7 +756,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - unsigned sbwcount; + unsigned int sbwcount; down_read(&nilfs->ns_sem); sbwcount = nilfs->ns_sbwcount; @@ -770,7 +770,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - unsigned sb_update_freq; + unsigned int sb_update_freq; down_read(&nilfs->ns_sem); sb_update_freq = nilfs->ns_sb_update_freq; @@ -784,7 +784,7 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, const char *buf, size_t count) { - unsigned val; + unsigned int val; int err; err = kstrtouint(skip_spaces(buf), 0, &val); diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h index 677e3a1a8370..648cedf9c06e 100644 --- a/fs/nilfs2/sysfs.h +++ b/fs/nilfs2/sysfs.h @@ -66,7 +66,7 @@ struct nilfs_##name##_attr { \ char *); \ ssize_t (*store)(struct kobject *, struct attribute *, \ const char *, size_t); \ -}; +} NILFS_COMMON_ATTR_STRUCT(feature); @@ -77,7 +77,7 @@ struct nilfs_##name##_attr { \ char *); \ ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \ const char *, size_t); \ -}; +} NILFS_DEV_ATTR_STRUCT(dev); NILFS_DEV_ATTR_STRUCT(segments); @@ -93,7 +93,7 @@ struct nilfs_##name##_attr { \ char *); \ ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \ const char *, size_t); \ -}; +} NILFS_CP_ATTR_STRUCT(snapshot); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 69bd801afb53..809bd2de7ad0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ @@ -112,8 +108,8 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, struct nilfs_super_root *raw_sr; struct nilfs_super_block **sbp = nilfs->ns_sbp; struct nilfs_inode *rawi; - unsigned dat_entry_size, segment_usage_size, checkpoint_size; - unsigned inode_size; + unsigned int dat_entry_size, segment_usage_size, checkpoint_size; + unsigned int inode_size; int err; err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); @@ -621,8 +617,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); if (err) goto out; - /* not failed_sbh; sbh is released automatically - when reloading fails. */ + /* + * Not to failed_sbh; sbh is released automatically + * when reloading fails. + */ } nilfs->ns_blocksize_bits = sb->s_blocksize_bits; nilfs->ns_blocksize = blocksize; diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 23778d385836..79369fd6b13b 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ @@ -118,10 +114,10 @@ struct the_nilfs { struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; time_t ns_sbwtime; - unsigned ns_sbwcount; - unsigned ns_sbsize; - unsigned ns_mount_state; - unsigned ns_sb_update_freq; + unsigned int ns_sbwcount; + unsigned int ns_sbsize; + unsigned int ns_mount_state; + unsigned int ns_sb_update_freq; /* * Following fields are dedicated to a writable FS-instance. @@ -226,15 +222,14 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty) * Mount option operations */ #define nilfs_clear_opt(nilfs, opt) \ - do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0) + ((nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt) #define nilfs_set_opt(nilfs, opt) \ - do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0) + ((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt) #define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt) #define nilfs_write_opt(nilfs, mask, opt) \ - do { (nilfs)->ns_mount_opt = \ + ((nilfs)->ns_mount_opt = \ (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \ - NILFS_MOUNT_##opt); \ - } while (0) + NILFS_MOUNT_##opt)) \ /** * struct nilfs_root - nilfs root object @@ -273,6 +268,7 @@ struct nilfs_root { static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { u64 t = get_seconds(); + return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; } @@ -280,6 +276,7 @@ static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) { int flip_bits = nilfs->ns_sbwcount & 0x0FL; + return (flip_bits != 0x08 && flip_bits != 0x0F); } @@ -308,7 +305,7 @@ static inline void nilfs_get_root(struct nilfs_root *root) static inline int nilfs_valid_fs(struct the_nilfs *nilfs) { - unsigned valid_fs; + unsigned int valid_fs; down_read(&nilfs->ns_sem); valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index b44c68a857e7..0a3bc2cf192c 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, &mnt->mnt_root->d_lock); } +/* prepare for freeing all marks associated with given group */ +extern void fsnotify_detach_group_marks(struct fsnotify_group *group); +/* + * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list + */ +extern void fsnotify_mark_destroy_list(void); + /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/group.c b/fs/notify/group.c index d16b62cb2854..3e2dd85be5dd 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { - /* clear all inode marks for this group */ - fsnotify_clear_marks_by_group(group); + /* clear all inode marks for this group, attach them to destroy_list */ + fsnotify_detach_group_marks(group); - synchronize_srcu(&fsnotify_mark_srcu); + /* + * Wait for fsnotify_mark_srcu period to end and free all marks in + * destroy_list + */ + fsnotify_mark_destroy_list(); - /* clear the notification queue of all events */ + /* + * Since we have waited for fsnotify_mark_srcu in + * fsnotify_mark_destroy_list() there can be no outstanding event + * notification against this group. So clearing the notification queue + * of all events is reliable now. + */ fsnotify_flush_notify(group); /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 7115c5d7d373..d3fea0bd89e2 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); -static void fsnotify_mark_destroy(struct work_struct *work); -static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy); +static void fsnotify_mark_destroy_workfn(struct work_struct *work); +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) } /* - * Free fsnotify mark. The freeing is actually happening from a kthread which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. + * Prepare mark for freeing and add it to the list of marks prepared for + * freeing. The actual freeing must happen after SRCU period ends and the + * caller is responsible for this. + * + * The function returns true if the mark was added to the list of marks for + * freeing. The function returns false if someone else has already called + * __fsnotify_free_mark() for the mark. */ -void fsnotify_free_mark(struct fsnotify_mark *mark) +static bool __fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; @@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - return; + return false; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark @@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); + + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + + return true; +} + +/* + * Free fsnotify mark. The freeing is actually happening from a workqueue which + * first waits for srcu period end. Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. + */ +void fsnotify_free_mark(struct fsnotify_mark *mark) +{ + if (__fsnotify_free_mark(mark)) { + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); + } } void fsnotify_destroy_mark(struct fsnotify_mark *mark, @@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, } /* - * Given a group, destroy all of the marks associated with that group. + * Given a group, prepare for freeing all the marks associated with that group. + * The marks are attached to the list of marks prepared for destruction, the + * caller is responsible for freeing marks in that list after SRCU period has + * ended. */ -void fsnotify_clear_marks_by_group(struct fsnotify_group *group) +void fsnotify_detach_group_marks(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1); + struct fsnotify_mark *mark; + + while (1) { + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + if (list_empty(&group->marks_list)) { + mutex_unlock(&group->mark_mutex); + break; + } + mark = list_first_entry(&group->marks_list, + struct fsnotify_mark, g_list); + fsnotify_get_mark(mark); + fsnotify_detach_mark(mark); + mutex_unlock(&group->mark_mutex); + __fsnotify_free_mark(mark); + fsnotify_put_mark(mark); + } } void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) @@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, mark->free_mark = free_mark; } -static void fsnotify_mark_destroy(struct work_struct *work) +/* + * Destroy all marks in destroy_list, waits for SRCU period to finish before + * actually freeing marks. + */ +void fsnotify_mark_destroy_list(void) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; @@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work) fsnotify_put_mark(mark); } } + +static void fsnotify_mark_destroy_workfn(struct work_struct *work) +{ + fsnotify_mark_destroy_list(); +} diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 7521e11db728..97768a1379f2 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -74,7 +74,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); - file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) + + file_ofs = ((s64)page->index << PAGE_SHIFT) + bh_offset(bh); read_lock_irqsave(&ni->size_lock, flags); init_size = ni->initialized_size; @@ -142,7 +142,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) u32 rec_size; rec_size = ni->itype.index.block_size; - recs = PAGE_CACHE_SIZE / rec_size; + recs = PAGE_SIZE / rec_size; /* Should have been verified before we got here... */ BUG_ON(!recs); local_irq_save(flags); @@ -229,7 +229,7 @@ static int ntfs_read_block(struct page *page) * fully truncated, truncate will throw it away as soon as we unlock * it so no need to worry what we do with it. */ - iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits); read_lock_irqsave(&ni->size_lock, flags); lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; init_size = ni->initialized_size; @@ -412,9 +412,9 @@ retry_readpage: vi = page->mapping->host; i_size = i_size_read(vi); /* Is the page fully outside i_size? (truncate in progress) */ - if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) { - zero_user(page, 0, PAGE_CACHE_SIZE); + if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> + PAGE_SHIFT)) { + zero_user(page, 0, PAGE_SIZE); ntfs_debug("Read outside i_size - truncated?"); goto done; } @@ -463,7 +463,7 @@ retry_readpage: * ok to ignore the compressed flag here. */ if (unlikely(page->index > 0)) { - zero_user(page, 0, PAGE_CACHE_SIZE); + zero_user(page, 0, PAGE_SIZE); goto done; } if (!NInoAttr(ni)) @@ -509,7 +509,7 @@ retry_readpage: le16_to_cpu(ctx->attr->data.resident.value_offset), attr_len); /* Zero the remainder of the page. */ - memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + memset(addr + attr_len, 0, PAGE_SIZE - attr_len); flush_dcache_page(page); kunmap_atomic(addr); put_unm_err_out: @@ -599,7 +599,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) /* NOTE: Different naming scheme to ntfs_read_block()! */ /* The first block in the page. */ - block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + block = (s64)page->index << (PAGE_SHIFT - blocksize_bits); read_lock_irqsave(&ni->size_lock, flags); i_size = i_size_read(vi); @@ -674,7 +674,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) // in the inode. // Again, for each page do: // __set_page_dirty_buffers(); - // page_cache_release() + // put_page() // We don't need to wait on the writes. // Update iblock. } @@ -925,7 +925,7 @@ static int ntfs_write_mst_block(struct page *page, ntfs_volume *vol = ni->vol; u8 *kaddr; unsigned int rec_size = ni->itype.index.block_size; - ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size]; + ntfs_inode *locked_nis[PAGE_SIZE / rec_size]; struct buffer_head *bh, *head, *tbh, *rec_start_bh; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; runlist_element *rl; @@ -949,7 +949,7 @@ static int ntfs_write_mst_block(struct page *page, (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); bh_size = vol->sb->s_blocksize; bh_size_bits = vol->sb->s_blocksize_bits; - max_bhs = PAGE_CACHE_SIZE / bh_size; + max_bhs = PAGE_SIZE / bh_size; BUG_ON(!max_bhs); BUG_ON(max_bhs > MAX_BUF_PER_PAGE); @@ -961,13 +961,13 @@ static int ntfs_write_mst_block(struct page *page, BUG_ON(!bh); rec_size_bits = ni->itype.index.block_size_bits; - BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits)); + BUG_ON(!(PAGE_SIZE >> rec_size_bits)); bhs_per_rec = rec_size >> bh_size_bits; BUG_ON(!bhs_per_rec); /* The first block in the page. */ rec_block = block = (sector_t)page->index << - (PAGE_CACHE_SHIFT - bh_size_bits); + (PAGE_SHIFT - bh_size_bits); /* The first out of bounds block for the data size. */ dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; @@ -1133,7 +1133,7 @@ lock_retry_remap: unsigned long mft_no; /* Get the mft record number. */ - mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs) + mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) >> rec_size_bits; /* Check whether to write this mft record. */ tni = NULL; @@ -1249,7 +1249,7 @@ do_mirror: continue; ofs = bh_offset(tbh); /* Get the mft record number. */ - mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs) + mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) >> rec_size_bits; if (mft_no < vol->mftmirr_size) ntfs_sync_mft_mirror(vol, mft_no, @@ -1300,7 +1300,7 @@ done: * Set page error if there is only one ntfs record in the page. * Otherwise we would loose per-record granularity. */ - if (ni->itype.index.block_size == PAGE_CACHE_SIZE) + if (ni->itype.index.block_size == PAGE_SIZE) SetPageError(page); NVolSetErrors(vol); } @@ -1308,7 +1308,7 @@ done: ntfs_debug("Page still contains one or more dirty ntfs " "records. Redirtying the page starting at " "record 0x%lx.", page->index << - (PAGE_CACHE_SHIFT - rec_size_bits)); + (PAGE_SHIFT - rec_size_bits)); redirty_page_for_writepage(wbc, page); unlock_page(page); } else { @@ -1365,13 +1365,13 @@ retry_writepage: BUG_ON(!PageLocked(page)); i_size = i_size_read(vi); /* Is the page fully outside i_size? (truncate in progress) */ - if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) { + if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> + PAGE_SHIFT)) { /* * The page may have dirty, unmapped buffers. Make them * freeable here, so the page does not leak. */ - block_invalidatepage(page, 0, PAGE_CACHE_SIZE); + block_invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); ntfs_debug("Write outside i_size - truncated?"); return 0; @@ -1414,10 +1414,10 @@ retry_writepage: /* NInoNonResident() == NInoIndexAllocPresent() */ if (NInoNonResident(ni)) { /* We have to zero every time due to mmap-at-end-of-file. */ - if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) { + if (page->index >= (i_size >> PAGE_SHIFT)) { /* The page straddles i_size. */ - unsigned int ofs = i_size & ~PAGE_CACHE_MASK; - zero_user_segment(page, ofs, PAGE_CACHE_SIZE); + unsigned int ofs = i_size & ~PAGE_MASK; + zero_user_segment(page, ofs, PAGE_SIZE); } /* Handle mst protected attributes. */ if (NInoMstProtected(ni)) @@ -1500,7 +1500,7 @@ retry_writepage: le16_to_cpu(ctx->attr->data.resident.value_offset), addr, attr_len); /* Zero out of bounds area in the page cache page. */ - memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + memset(addr + attr_len, 0, PAGE_SIZE - attr_len); kunmap_atomic(addr); flush_dcache_page(page); flush_dcache_mft_record_page(ctx->ntfs_ino); diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h index caecc58f529c..820d6eabf60f 100644 --- a/fs/ntfs/aops.h +++ b/fs/ntfs/aops.h @@ -40,7 +40,7 @@ static inline void ntfs_unmap_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } /** @@ -49,7 +49,7 @@ static inline void ntfs_unmap_page(struct page *page) * @index: index into the page cache for @mapping of the page to map * * Read a page from the page cache of the address space @mapping at position - * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes. + * @index, where @index is in units of PAGE_SIZE, and not in bytes. * * If the page is not in memory it is loaded from disk first using the readpage * method defined in the address space operations of @mapping and the page is diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 250ed5b20c8f..44a39a099b54 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -152,7 +152,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != old_ctx.base_ntfs_ino) { put_this_page = old_ctx.ntfs_ino->page; - page_cache_get(put_this_page); + get_page(put_this_page); } /* * Reinitialize the search context so we can lookup the @@ -275,7 +275,7 @@ retry_map: * the pieces anyway. */ if (put_this_page) - page_cache_release(put_this_page); + put_page(put_this_page); } return err; } @@ -1660,7 +1660,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) memcpy(kaddr, (u8*)a + le16_to_cpu(a->data.resident.value_offset), attr_size); - memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size); + memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size); kunmap_atomic(kaddr); flush_dcache_page(page); SetPageUptodate(page); @@ -1748,7 +1748,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) if (page) { set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); } ntfs_debug("Done."); return 0; @@ -1835,7 +1835,7 @@ rl_err_out: ntfs_free(rl); page_err_out: unlock_page(page); - page_cache_release(page); + put_page(page); } if (err == -EINVAL) err = -EIO; @@ -2513,17 +2513,17 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) BUG_ON(NInoEncrypted(ni)); mapping = VFS_I(ni)->i_mapping; /* Work out the starting index and page offset. */ - idx = ofs >> PAGE_CACHE_SHIFT; - start_ofs = ofs & ~PAGE_CACHE_MASK; + idx = ofs >> PAGE_SHIFT; + start_ofs = ofs & ~PAGE_MASK; /* Work out the ending index and page offset. */ end = ofs + cnt; - end_ofs = end & ~PAGE_CACHE_MASK; + end_ofs = end & ~PAGE_MASK; /* If the end is outside the inode size return -ESPIPE. */ if (unlikely(end > i_size_read(VFS_I(ni)))) { ntfs_error(vol->sb, "Request exceeds end of attribute."); return -ESPIPE; } - end >>= PAGE_CACHE_SHIFT; + end >>= PAGE_SHIFT; /* If there is a first partial page, need to do it the slow way. */ if (start_ofs) { page = read_mapping_page(mapping, idx, NULL); @@ -2536,7 +2536,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) * If the last page is the same as the first page, need to * limit the write to the end offset. */ - size = PAGE_CACHE_SIZE; + size = PAGE_SIZE; if (idx == end) size = end_ofs; kaddr = kmap_atomic(page); @@ -2544,7 +2544,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) flush_dcache_page(page); kunmap_atomic(kaddr); set_page_dirty(page); - page_cache_release(page); + put_page(page); balance_dirty_pages_ratelimited(mapping); cond_resched(); if (idx == end) @@ -2561,7 +2561,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) return -ENOMEM; } kaddr = kmap_atomic(page); - memset(kaddr, val, PAGE_CACHE_SIZE); + memset(kaddr, val, PAGE_SIZE); flush_dcache_page(page); kunmap_atomic(kaddr); /* @@ -2585,7 +2585,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) set_page_dirty(page); /* Finally unlock and release the page. */ unlock_page(page); - page_cache_release(page); + put_page(page); balance_dirty_pages_ratelimited(mapping); cond_resched(); } @@ -2602,7 +2602,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) flush_dcache_page(page); kunmap_atomic(kaddr); set_page_dirty(page); - page_cache_release(page); + put_page(page); balance_dirty_pages_ratelimited(mapping); cond_resched(); } diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c index 0809cf876098..ec130c588d2b 100644 --- a/fs/ntfs/bitmap.c +++ b/fs/ntfs/bitmap.c @@ -67,8 +67,8 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, * Calculate the indices for the pages containing the first and last * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. */ - index = start_bit >> (3 + PAGE_CACHE_SHIFT); - end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT); + index = start_bit >> (3 + PAGE_SHIFT); + end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT); /* Get the page containing the first bit (@start_bit). */ mapping = vi->i_mapping; @@ -82,7 +82,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, kaddr = page_address(page); /* Set @pos to the position of the byte containing @start_bit. */ - pos = (start_bit >> 3) & ~PAGE_CACHE_MASK; + pos = (start_bit >> 3) & ~PAGE_MASK; /* Calculate the position of @start_bit in the first byte. */ bit = start_bit & 7; @@ -108,7 +108,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, * Depending on @value, modify all remaining whole bytes in the page up * to @cnt. */ - len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos); + len = min_t(s64, cnt >> 3, PAGE_SIZE - pos); memset(kaddr + pos, value ? 0xff : 0, len); cnt -= len << 3; @@ -132,7 +132,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, * Depending on @value, modify all remaining whole bytes in the * page up to @cnt. */ - len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE); + len = min_t(s64, cnt >> 3, PAGE_SIZE); memset(kaddr, value ? 0xff : 0, len); cnt -= len << 3; } diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index f82498c35e78..f2b5e746f49b 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -104,16 +104,12 @@ static void zero_partial_compressed_page(struct page *page, unsigned int kp_ofs; ntfs_debug("Zeroing page region outside initialized size."); - if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) { - /* - * FIXME: Using clear_page() will become wrong when we get - * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem. - */ + if (((s64)page->index << PAGE_SHIFT) >= initialized_size) { clear_page(kp); return; } - kp_ofs = initialized_size & ~PAGE_CACHE_MASK; - memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs); + kp_ofs = initialized_size & ~PAGE_MASK; + memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs); return; } @@ -123,7 +119,7 @@ static void zero_partial_compressed_page(struct page *page, static inline void handle_bounds_compressed_page(struct page *page, const loff_t i_size, const s64 initialized_size) { - if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) && + if ((page->index >= (initialized_size >> PAGE_SHIFT)) && (initialized_size < i_size)) zero_partial_compressed_page(page, initialized_size); return; @@ -160,7 +156,7 @@ static inline void handle_bounds_compressed_page(struct page *page, * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was * completed during the decompression of the compression block (@cb_start). * - * Warning: This function *REQUIRES* PAGE_CACHE_SIZE >= 4096 or it will blow up + * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up * unpredicatbly! You have been warned! * * Note to hackers: This function may not sleep until it has finished accessing @@ -241,7 +237,7 @@ return_error: if (di == xpage) *xpage_done = 1; else - page_cache_release(dp); + put_page(dp); dest_pages[di] = NULL; } } @@ -274,7 +270,7 @@ return_error: cb = cb_sb_end; /* Advance destination position to next sub-block. */ - *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK; + *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK; if (!*dest_ofs && (++*dest_index > dest_max_index)) goto return_overflow; goto do_next_sb; @@ -301,7 +297,7 @@ return_error: /* Advance destination position to next sub-block. */ *dest_ofs += NTFS_SB_SIZE; - if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) { + if (!(*dest_ofs &= ~PAGE_MASK)) { finalize_page: /* * First stage: add current page index to array of @@ -335,7 +331,7 @@ do_next_tag: *dest_ofs += nr_bytes; } /* We have finished the current sub-block. */ - if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) + if (!(*dest_ofs &= ~PAGE_MASK)) goto finalize_page; goto do_next_sb; } @@ -462,7 +458,7 @@ return_overflow: * have been written to so that we would lose data if we were to just overwrite * them with the out-of-date uncompressed data. * - * FIXME: For PAGE_CACHE_SIZE > cb_size we are not doing the Right Thing(TM) at + * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at * the end of the file I think. We need to detect this case and zero the out * of bounds remainder of the page in question and mark it as handled. At the * moment we would just return -EIO on such a page. This bug will only become @@ -470,7 +466,7 @@ return_overflow: * clusters so is probably not going to be seen by anyone. Still this should * be fixed. (AIA) * - * FIXME: Again for PAGE_CACHE_SIZE > cb_size we are screwing up both in + * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in * handling sparse and compressed cbs. (AIA) * * FIXME: At the moment we don't do any zeroing out in the case that @@ -497,14 +493,14 @@ int ntfs_read_compressed_block(struct page *page) u64 cb_size_mask = cb_size - 1UL; VCN vcn; LCN lcn; - /* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */ - VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >> + /* The first wanted vcn (minimum alignment is PAGE_SIZE). */ + VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> vol->cluster_size_bits; /* * The first vcn after the last wanted vcn (minimum alignment is again - * PAGE_CACHE_SIZE. + * PAGE_SIZE. */ - VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1) + VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) & ~cb_size_mask) >> vol->cluster_size_bits; /* Number of compression blocks (cbs) in the wanted vcn range. */ unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits @@ -515,7 +511,7 @@ int ntfs_read_compressed_block(struct page *page) * guarantees of start_vcn and end_vcn, no need to round up here. */ unsigned int nr_pages = (end_vcn - start_vcn) << - vol->cluster_size_bits >> PAGE_CACHE_SHIFT; + vol->cluster_size_bits >> PAGE_SHIFT; unsigned int xpage, max_page, cur_page, cur_ofs, i; unsigned int cb_clusters, cb_max_ofs; int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; @@ -549,7 +545,7 @@ int ntfs_read_compressed_block(struct page *page) * We have already been given one page, this is the one we must do. * Once again, the alignment guarantees keep it simple. */ - offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT; + offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT; xpage = index - offset; pages[xpage] = page; /* @@ -560,13 +556,13 @@ int ntfs_read_compressed_block(struct page *page) i_size = i_size_read(VFS_I(ni)); initialized_size = ni->initialized_size; read_unlock_irqrestore(&ni->size_lock, flags); - max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) - offset; /* Is the page fully outside i_size? (truncate in progress) */ if (xpage >= max_page) { kfree(bhs); kfree(pages); - zero_user(page, 0, PAGE_CACHE_SIZE); + zero_user(page, 0, PAGE_SIZE); ntfs_debug("Compressed read outside i_size - truncated?"); SetPageUptodate(page); unlock_page(page); @@ -591,7 +587,7 @@ int ntfs_read_compressed_block(struct page *page) continue; } unlock_page(page); - page_cache_release(page); + put_page(page); pages[i] = NULL; } } @@ -735,9 +731,9 @@ lock_retry_remap: ntfs_debug("Successfully read the compression block."); /* The last page and maximum offset within it for the current cb. */ - cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size; - cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK; - cb_max_page >>= PAGE_CACHE_SHIFT; + cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size; + cb_max_ofs = cb_max_page & ~PAGE_MASK; + cb_max_page >>= PAGE_SHIFT; /* Catch end of file inside a compression block. */ if (cb_max_page > max_page) @@ -753,16 +749,11 @@ lock_retry_remap: for (; cur_page < cb_max_page; cur_page++) { page = pages[cur_page]; if (page) { - /* - * FIXME: Using clear_page() will become wrong - * when we get PAGE_CACHE_SIZE != PAGE_SIZE but - * for now there is no problem. - */ if (likely(!cur_ofs)) clear_page(page_address(page)); else memset(page_address(page) + cur_ofs, 0, - PAGE_CACHE_SIZE - + PAGE_SIZE - cur_ofs); flush_dcache_page(page); kunmap(page); @@ -771,10 +762,10 @@ lock_retry_remap: if (cur_page == xpage) xpage_done = 1; else - page_cache_release(page); + put_page(page); pages[cur_page] = NULL; } - cb_pos += PAGE_CACHE_SIZE - cur_ofs; + cb_pos += PAGE_SIZE - cur_ofs; cur_ofs = 0; if (cb_pos >= cb_end) break; @@ -807,7 +798,7 @@ lock_retry_remap: * synchronous io for the majority of pages. * Or if we choose not to do the read-ahead/-behind stuff, we * could just return block_read_full_page(pages[xpage]) as long - * as PAGE_CACHE_SIZE <= cb_size. + * as PAGE_SIZE <= cb_size. */ if (cb_max_ofs) cb_max_page--; @@ -816,8 +807,8 @@ lock_retry_remap: page = pages[cur_page]; if (page) memcpy(page_address(page) + cur_ofs, cb_pos, - PAGE_CACHE_SIZE - cur_ofs); - cb_pos += PAGE_CACHE_SIZE - cur_ofs; + PAGE_SIZE - cur_ofs); + cb_pos += PAGE_SIZE - cur_ofs; cur_ofs = 0; if (cb_pos >= cb_end) break; @@ -850,10 +841,10 @@ lock_retry_remap: if (cur2_page == xpage) xpage_done = 1; else - page_cache_release(page); + put_page(page); pages[cur2_page] = NULL; } - cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2; + cb_pos2 += PAGE_SIZE - cur_ofs2; cur_ofs2 = 0; if (cb_pos2 >= cb_end) break; @@ -884,7 +875,7 @@ lock_retry_remap: kunmap(page); unlock_page(page); if (prev_cur_page != xpage) - page_cache_release(page); + put_page(page); pages[prev_cur_page] = NULL; } } @@ -914,7 +905,7 @@ lock_retry_remap: kunmap(page); unlock_page(page); if (cur_page != xpage) - page_cache_release(page); + put_page(page); pages[cur_page] = NULL; } } @@ -961,7 +952,7 @@ err_out: kunmap(page); unlock_page(page); if (i != xpage) - page_cache_release(page); + put_page(page); } } kfree(pages); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index b2eff5816adc..a18613579001 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -315,11 +315,11 @@ found_it: descend_into_child_node: /* * Convert vcn to index into the index allocation attribute in units - * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * of PAGE_SIZE and map the page cache page, reading it from * disk if necessary. */ page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); if (IS_ERR(page)) { ntfs_error(sb, "Failed to map directory index page, error %ld.", -PTR_ERR(page)); @@ -331,9 +331,9 @@ descend_into_child_node: fast_descend_into_child_node: /* Get to the index allocation block. */ ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { ntfs_error(sb, "Out of bounds check failed. Corrupt directory " "inode 0x%lx or driver bug.", dir_ni->mft_no); goto unm_err_out; @@ -366,7 +366,7 @@ fast_descend_into_child_node: goto unm_err_out; } index_end = (u8*)ia + dir_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_CACHE_SIZE) { + if (index_end > kaddr + PAGE_SIZE) { ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " "0x%lx crosses page boundary. Impossible! " "Cannot access! This is probably a bug in the " @@ -559,9 +559,9 @@ found_it2: /* If vcn is in the same page cache page as old_vcn we * recycle the mapped page. */ if (old_vcn << vol->cluster_size_bits >> - PAGE_CACHE_SHIFT == vcn << + PAGE_SHIFT == vcn << vol->cluster_size_bits >> - PAGE_CACHE_SHIFT) + PAGE_SHIFT) goto fast_descend_into_child_node; unlock_page(page); ntfs_unmap_page(page); @@ -793,11 +793,11 @@ found_it: descend_into_child_node: /* * Convert vcn to index into the index allocation attribute in units - * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * of PAGE_SIZE and map the page cache page, reading it from * disk if necessary. */ page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); if (IS_ERR(page)) { ntfs_error(sb, "Failed to map directory index page, error %ld.", -PTR_ERR(page)); @@ -809,9 +809,9 @@ descend_into_child_node: fast_descend_into_child_node: /* Get to the index allocation block. */ ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { ntfs_error(sb, "Out of bounds check failed. Corrupt directory " "inode 0x%lx or driver bug.", dir_ni->mft_no); goto unm_err_out; @@ -844,7 +844,7 @@ fast_descend_into_child_node: goto unm_err_out; } index_end = (u8*)ia + dir_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_CACHE_SIZE) { + if (index_end > kaddr + PAGE_SIZE) { ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " "0x%lx crosses page boundary. Impossible! " "Cannot access! This is probably a bug in the " @@ -968,9 +968,9 @@ found_it2: /* If vcn is in the same page cache page as old_vcn we * recycle the mapped page. */ if (old_vcn << vol->cluster_size_bits >> - PAGE_CACHE_SHIFT == vcn << + PAGE_SHIFT == vcn << vol->cluster_size_bits >> - PAGE_CACHE_SHIFT) + PAGE_SHIFT) goto fast_descend_into_child_node; unlock_page(page); ntfs_unmap_page(page); @@ -1246,15 +1246,15 @@ skip_index_root: goto iput_err_out; } /* Get the starting bit position in the current bitmap page. */ - cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1); - bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1); + cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1); + bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1); get_next_bmp_page: ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", - (unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT), + (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT), (unsigned long long)bmp_pos & - (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1)); + (unsigned long long)((PAGE_SIZE * 8) - 1)); bmp_page = ntfs_map_page(bmp_mapping, - bmp_pos >> (3 + PAGE_CACHE_SHIFT)); + bmp_pos >> (3 + PAGE_SHIFT)); if (IS_ERR(bmp_page)) { ntfs_error(sb, "Reading index bitmap failed."); err = PTR_ERR(bmp_page); @@ -1270,9 +1270,9 @@ find_next_index_buffer: * If we have reached the end of the bitmap page, get the next * page, and put away the old one. */ - if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) { + if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) { ntfs_unmap_page(bmp_page); - bmp_pos += PAGE_CACHE_SIZE * 8; + bmp_pos += PAGE_SIZE * 8; cur_bmp_pos = 0; goto get_next_bmp_page; } @@ -1285,8 +1285,8 @@ find_next_index_buffer: ntfs_debug("Handling index buffer 0x%llx.", (unsigned long long)bmp_pos + cur_bmp_pos); /* If the current index buffer is in the same page we reuse the page. */ - if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) != - (ia_pos & (s64)PAGE_CACHE_MASK)) { + if ((prev_ia_pos & (s64)PAGE_MASK) != + (ia_pos & (s64)PAGE_MASK)) { prev_ia_pos = ia_pos; if (likely(ia_page != NULL)) { unlock_page(ia_page); @@ -1296,7 +1296,7 @@ find_next_index_buffer: * Map the page cache page containing the current ia_pos, * reading it from disk if necessary. */ - ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT); + ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT); if (IS_ERR(ia_page)) { ntfs_error(sb, "Reading index allocation data failed."); err = PTR_ERR(ia_page); @@ -1307,10 +1307,10 @@ find_next_index_buffer: kaddr = (u8*)page_address(ia_page); } /* Get the current index buffer. */ - ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK & - ~(s64)(ndir->itype.index.block_size - 1))); + ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK & + ~(s64)(ndir->itype.index.block_size - 1))); /* Bounds checks. */ - if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) { + if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) { ntfs_error(sb, "Out of bounds check failed. Corrupt directory " "inode 0x%lx or driver bug.", vdir->i_ino); goto err_out; @@ -1348,7 +1348,7 @@ find_next_index_buffer: goto err_out; } index_end = (u8*)ia + ndir->itype.index.block_size; - if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) { + if (unlikely(index_end > kaddr + PAGE_SIZE)) { ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " "0x%lx crosses page boundary. Impossible! " "Cannot access! This is probably a bug in the " diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index bed4d427dfae..5622ed5a201e 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -220,8 +220,8 @@ do_non_resident_extend: m = NULL; } mapping = vi->i_mapping; - index = old_init_size >> PAGE_CACHE_SHIFT; - end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + index = old_init_size >> PAGE_SHIFT; + end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; do { /* * Read the page. If the page is not present, this will zero @@ -233,7 +233,7 @@ do_non_resident_extend: goto init_err_out; } if (unlikely(PageError(page))) { - page_cache_release(page); + put_page(page); err = -EIO; goto init_err_out; } @@ -242,13 +242,13 @@ do_non_resident_extend: * enough to make ntfs_writepage() work. */ write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT; + ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; if (ni->initialized_size > new_init_size) ni->initialized_size = new_init_size; write_unlock_irqrestore(&ni->size_lock, flags); /* Set the page dirty so it gets written out. */ set_page_dirty(page); - page_cache_release(page); + put_page(page); /* * Play nice with the vm and the rest of the system. This is * very much needed as we can potentially be modifying the @@ -543,7 +543,7 @@ out: err_out: while (nr > 0) { unlock_page(pages[--nr]); - page_cache_release(pages[nr]); + put_page(pages[nr]); } goto out; } @@ -573,7 +573,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) * only partially being written to. * * If @nr_pages is greater than one, we are guaranteed that the cluster size is - * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside + * greater than PAGE_SIZE, that all pages in @pages are entirely inside * the same cluster and that they are the entirety of that cluster, and that * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. * @@ -653,7 +653,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, u = 0; do_next_page: page = pages[u]; - bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; + bh_pos = (s64)page->index << PAGE_SHIFT; bh = head = page_buffers(page); do { VCN cdelta; @@ -810,11 +810,11 @@ map_buffer_cached: kaddr = kmap_atomic(page); if (bh_pos < pos) { - pofs = bh_pos & ~PAGE_CACHE_MASK; + pofs = bh_pos & ~PAGE_MASK; memset(kaddr + pofs, 0, pos - bh_pos); } if (bh_end > end) { - pofs = end & ~PAGE_CACHE_MASK; + pofs = end & ~PAGE_MASK; memset(kaddr + pofs, 0, bh_end - end); } kunmap_atomic(kaddr); @@ -942,7 +942,7 @@ rl_not_mapped_enoent: * unmapped. This can only happen when the cluster size is * less than the page cache size. */ - if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) { + if (unlikely(vol->cluster_size < PAGE_SIZE)) { bh_cend = (bh_end + vol->cluster_size - 1) >> vol->cluster_size_bits; if ((bh_cend <= cpos || bh_cpos >= cend)) { @@ -1208,7 +1208,7 @@ rl_not_mapped_enoent: wait_on_buffer(bh); if (likely(buffer_uptodate(bh))) { page = bh->b_page; - bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_pos = ((s64)page->index << PAGE_SHIFT) + bh_offset(bh); /* * If the buffer overflows the initialized size, need @@ -1350,7 +1350,7 @@ rl_not_mapped_enoent: bh = head = page_buffers(page); do { if (u == nr_pages && - ((s64)page->index << PAGE_CACHE_SHIFT) + + ((s64)page->index << PAGE_SHIFT) + bh_offset(bh) >= end) break; if (!buffer_new(bh)) @@ -1422,7 +1422,7 @@ static inline int ntfs_commit_pages_after_non_resident_write( bool partial; page = pages[u]; - bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; + bh_pos = (s64)page->index << PAGE_SHIFT; bh = head = page_buffers(page); partial = false; do { @@ -1639,7 +1639,7 @@ static int ntfs_commit_pages_after_write(struct page **pages, if (end < attr_len) memcpy(kaddr + end, kattr + end, attr_len - end); /* Zero the region outside the end of the attribute value. */ - memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); flush_dcache_page(page); SetPageUptodate(page); } @@ -1706,7 +1706,7 @@ static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, unsigned len, copied; do { - len = PAGE_CACHE_SIZE - ofs; + len = PAGE_SIZE - ofs; if (len > bytes) len = bytes; copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs, @@ -1724,14 +1724,14 @@ out: return total; err: /* Zero the rest of the target like __copy_from_user(). */ - len = PAGE_CACHE_SIZE - copied; + len = PAGE_SIZE - copied; do { if (len > bytes) len = bytes; zero_user(*pages, copied, len); bytes -= len; copied = 0; - len = PAGE_CACHE_SIZE; + len = PAGE_SIZE; } while (++pages < last_page); goto out; } @@ -1787,8 +1787,8 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, * attributes. */ nr_pages = 1; - if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni)) - nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT; + if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) + nr_pages = vol->cluster_size >> PAGE_SHIFT; last_vcn = -1; do { VCN vcn; @@ -1796,9 +1796,9 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, unsigned ofs, do_pages, u; size_t copied; - start_idx = idx = pos >> PAGE_CACHE_SHIFT; - ofs = pos & ~PAGE_CACHE_MASK; - bytes = PAGE_CACHE_SIZE - ofs; + start_idx = idx = pos >> PAGE_SHIFT; + ofs = pos & ~PAGE_MASK; + bytes = PAGE_SIZE - ofs; do_pages = 1; if (nr_pages > 1) { vcn = pos >> vol->cluster_size_bits; @@ -1832,7 +1832,7 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, if (lcn == LCN_HOLE) { start_idx = (pos & ~(s64) vol->cluster_size_mask) - >> PAGE_CACHE_SHIFT; + >> PAGE_SHIFT; bytes = vol->cluster_size - (pos & vol->cluster_size_mask); do_pages = nr_pages; @@ -1871,12 +1871,12 @@ again: if (unlikely(status)) { do { unlock_page(pages[--do_pages]); - page_cache_release(pages[do_pages]); + put_page(pages[do_pages]); } while (do_pages); break; } } - u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index; + u = (pos >> PAGE_SHIFT) - pages[0]->index; copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, i, bytes); ntfs_flush_dcache_pages(pages + u, do_pages - u); @@ -1889,7 +1889,7 @@ again: } do { unlock_page(pages[--do_pages]); - page_cache_release(pages[do_pages]); + put_page(pages[do_pages]); } while (do_pages); if (unlikely(status < 0)) break; @@ -1921,7 +1921,7 @@ again: } } while (iov_iter_count(i)); if (cached_page) - page_cache_release(cached_page); + put_page(cached_page); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, (long)status); @@ -1952,12 +1952,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) written = ntfs_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; inode_unlock(vi); - if (likely(written > 0)) { - err = generic_write_sync(file, iocb->ki_pos, written); - if (err < 0) - written = 0; - } iocb->ki_pos += written; + if (likely(written > 0)) + written = generic_write_sync(iocb, written); return written ? written : err; } diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c index 096c135691ae..0d645f357930 100644 --- a/fs/ntfs/index.c +++ b/fs/ntfs/index.c @@ -272,11 +272,11 @@ done: descend_into_child_node: /* * Convert vcn to index into the index allocation attribute in units - * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * of PAGE_SIZE and map the page cache page, reading it from * disk if necessary. */ page = ntfs_map_page(ia_mapping, vcn << - idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); if (IS_ERR(page)) { ntfs_error(sb, "Failed to map index page, error %ld.", -PTR_ERR(page)); @@ -288,9 +288,9 @@ descend_into_child_node: fast_descend_into_child_node: /* Get to the index allocation block. */ ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { ntfs_error(sb, "Out of bounds check failed. Corrupt inode " "0x%lx or driver bug.", idx_ni->mft_no); goto unm_err_out; @@ -323,7 +323,7 @@ fast_descend_into_child_node: goto unm_err_out; } index_end = (u8*)ia + idx_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_CACHE_SIZE) { + if (index_end > kaddr + PAGE_SIZE) { ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " "crosses page boundary. Impossible! Cannot " "access! This is probably a bug in the " @@ -427,9 +427,9 @@ ia_done: * the mapped page. */ if (old_vcn << vol->cluster_size_bits >> - PAGE_CACHE_SHIFT == vcn << + PAGE_SHIFT == vcn << vol->cluster_size_bits >> - PAGE_CACHE_SHIFT) + PAGE_SHIFT) goto fast_descend_into_child_node; unlock_page(page); ntfs_unmap_page(page); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d284f07eda77..f40972d6df90 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -868,12 +868,12 @@ skip_attr_list_load: ni->itype.index.block_size); goto unm_err_out; } - if (ni->itype.index.block_size > PAGE_CACHE_SIZE) { + if (ni->itype.index.block_size > PAGE_SIZE) { ntfs_error(vi->i_sb, "Index block size (%u) > " - "PAGE_CACHE_SIZE (%ld) is not " + "PAGE_SIZE (%ld) is not " "supported. Sorry.", ni->itype.index.block_size, - PAGE_CACHE_SIZE); + PAGE_SIZE); err = -EOPNOTSUPP; goto unm_err_out; } @@ -1585,10 +1585,10 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) "two.", ni->itype.index.block_size); goto unm_err_out; } - if (ni->itype.index.block_size > PAGE_CACHE_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE " + if (ni->itype.index.block_size > PAGE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE " "(%ld) is not supported. Sorry.", - ni->itype.index.block_size, PAGE_CACHE_SIZE); + ni->itype.index.block_size, PAGE_SIZE); err = -EOPNOTSUPP; goto unm_err_out; } diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c index 1711b710b641..27a24a42f712 100644 --- a/fs/ntfs/lcnalloc.c +++ b/fs/ntfs/lcnalloc.c @@ -283,15 +283,15 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, ntfs_unmap_page(page); } page = ntfs_map_page(mapping, last_read_pos >> - PAGE_CACHE_SHIFT); + PAGE_SHIFT); if (IS_ERR(page)) { err = PTR_ERR(page); ntfs_error(vol->sb, "Failed to map page."); goto out; } - buf_size = last_read_pos & ~PAGE_CACHE_MASK; + buf_size = last_read_pos & ~PAGE_MASK; buf = page_address(page) + buf_size; - buf_size = PAGE_CACHE_SIZE - buf_size; + buf_size = PAGE_SIZE - buf_size; if (unlikely(last_read_pos + buf_size > i_size)) buf_size = i_size - last_read_pos; buf_size <<= 3; diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index c71de292c5ad..9d71213ca81e 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -381,7 +381,7 @@ static int ntfs_check_and_load_restart_page(struct inode *vi, * completely inside @rp, just copy it from there. Otherwise map all * the required pages and copy the data from them. */ - size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK); + size = PAGE_SIZE - (pos & ~PAGE_MASK); if (size >= le32_to_cpu(rp->system_page_size)) { memcpy(trp, rp, le32_to_cpu(rp->system_page_size)); } else { @@ -394,8 +394,8 @@ static int ntfs_check_and_load_restart_page(struct inode *vi, /* Copy the remaining data one page at a time. */ have_read = size; to_read = le32_to_cpu(rp->system_page_size) - size; - idx = (pos + size) >> PAGE_CACHE_SHIFT; - BUG_ON((pos + size) & ~PAGE_CACHE_MASK); + idx = (pos + size) >> PAGE_SHIFT; + BUG_ON((pos + size) & ~PAGE_MASK); do { page = ntfs_map_page(vi->i_mapping, idx); if (IS_ERR(page)) { @@ -406,7 +406,7 @@ static int ntfs_check_and_load_restart_page(struct inode *vi, err = -EIO; goto err_out; } - size = min_t(int, to_read, PAGE_CACHE_SIZE); + size = min_t(int, to_read, PAGE_SIZE); memcpy((u8*)trp + have_read, page_address(page), size); ntfs_unmap_page(page); have_read += size; @@ -509,11 +509,11 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) * log page size if the page cache size is between the default log page * size and twice that. */ - if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <= + if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2) log_page_size = DefaultLogPageSize; else - log_page_size = PAGE_CACHE_SIZE; + log_page_size = PAGE_SIZE; log_page_mask = log_page_size - 1; /* * Use ntfs_ffs() instead of ffs() to enable the compiler to @@ -539,7 +539,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) * to be empty. */ for (pos = 0; pos < size; pos <<= 1) { - pgoff_t idx = pos >> PAGE_CACHE_SHIFT; + pgoff_t idx = pos >> PAGE_SHIFT; if (!page || page->index != idx) { if (page) ntfs_unmap_page(page); @@ -550,7 +550,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) goto err_out; } } - kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK); + kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK); /* * A non-empty block means the logfile is not empty while an * empty block after a non-empty block has been encountered diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 3014a36a255b..37b2501caaa4 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -61,16 +61,16 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) * here if the volume was that big... */ index = (u64)ni->mft_no << vol->mft_record_size_bits >> - PAGE_CACHE_SHIFT; - ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + PAGE_SHIFT; + ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; i_size = i_size_read(mft_vi); /* The maximum valid index into the page cache for $MFT's data. */ - end_index = i_size >> PAGE_CACHE_SHIFT; + end_index = i_size >> PAGE_SHIFT; /* If the wanted index is out of bounds the mft record doesn't exist. */ if (unlikely(index >= end_index)) { - if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs + + if (index > end_index || (i_size & ~PAGE_MASK) < ofs + vol->mft_record_size) { page = ERR_PTR(-ENOENT); ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " @@ -487,7 +487,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, } /* Get the page containing the mirror copy of the mft record @m. */ page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> - (PAGE_CACHE_SHIFT - vol->mft_record_size_bits)); + (PAGE_SHIFT - vol->mft_record_size_bits)); if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to map mft mirror page."); err = PTR_ERR(page); @@ -497,7 +497,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, BUG_ON(!PageUptodate(page)); ClearPageUptodate(page); /* Offset of the mft mirror record inside the page. */ - page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; /* The address in the page of the mirror copy of the mft record @m. */ kmirr = page_address(page) + page_ofs; /* Copy the mst protected mft record to the mirror. */ @@ -1178,8 +1178,8 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, for (; pass <= 2;) { /* Cap size to pass_end. */ ofs = data_pos >> 3; - page_ofs = ofs & ~PAGE_CACHE_MASK; - size = PAGE_CACHE_SIZE - page_ofs; + page_ofs = ofs & ~PAGE_MASK; + size = PAGE_SIZE - page_ofs; ll = ((pass_end + 7) >> 3) - ofs; if (size > ll) size = ll; @@ -1190,7 +1190,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, */ if (size) { page = ntfs_map_page(mftbmp_mapping, - ofs >> PAGE_CACHE_SHIFT); + ofs >> PAGE_SHIFT); if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to read mft " "bitmap, aborting."); @@ -1328,13 +1328,13 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) */ ll = lcn >> 3; page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, - ll >> PAGE_CACHE_SHIFT); + ll >> PAGE_SHIFT); if (IS_ERR(page)) { up_write(&mftbmp_ni->runlist.lock); ntfs_error(vol->sb, "Failed to read from lcn bitmap."); return PTR_ERR(page); } - b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK); + b = (u8*)page_address(page) + (ll & ~PAGE_MASK); tb = 1 << (lcn & 7ull); down_write(&vol->lcnbmp_lock); if (*b != 0xff && !(*b & tb)) { @@ -2103,14 +2103,14 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) * The index into the page cache and the offset within the page cache * page of the wanted mft record. */ - index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; - ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT; + ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; /* The maximum valid index into the page cache for $MFT's data. */ i_size = i_size_read(mft_vi); - end_index = i_size >> PAGE_CACHE_SHIFT; + end_index = i_size >> PAGE_SHIFT; if (unlikely(index >= end_index)) { if (unlikely(index > end_index || ofs + vol->mft_record_size >= - (i_size & ~PAGE_CACHE_MASK))) { + (i_size & ~PAGE_MASK))) { ntfs_error(vol->sb, "Tried to format non-existing mft " "record 0x%llx.", (long long)mft_no); return -ENOENT; @@ -2515,8 +2515,8 @@ mft_rec_already_initialized: * We now have allocated and initialized the mft record. Calculate the * index of and the offset within the page cache page the record is in. */ - index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; - ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + index = bit << vol->mft_record_size_bits >> PAGE_SHIFT; + ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK; /* Read, map, and pin the page containing the mft record. */ page = ntfs_map_page(vol->mft_ino->i_mapping, index); if (IS_ERR(page)) { diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h index c581e26a350d..12de47b96ca9 100644 --- a/fs/ntfs/ntfs.h +++ b/fs/ntfs/ntfs.h @@ -43,7 +43,7 @@ typedef enum { NTFS_MAX_NAME_LEN = 255, NTFS_MAX_ATTR_NAME_LEN = 255, NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ - NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE, + NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE, } NTFS_CONSTANTS; /* Global variables. */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 1b38abdaa3ed..ecb49870a680 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -823,14 +823,14 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) ntfs_debug("vol->mft_record_size_bits = %i (0x%x)", vol->mft_record_size_bits, vol->mft_record_size_bits); /* - * We cannot support mft record sizes above the PAGE_CACHE_SIZE since + * We cannot support mft record sizes above the PAGE_SIZE since * we store $MFT/$DATA, the table of mft records in the page cache. */ - if (vol->mft_record_size > PAGE_CACHE_SIZE) { + if (vol->mft_record_size > PAGE_SIZE) { ntfs_error(vol->sb, "Mft record size (%i) exceeds the " - "PAGE_CACHE_SIZE on your system (%lu). " + "PAGE_SIZE on your system (%lu). " "This is not supported. Sorry.", - vol->mft_record_size, PAGE_CACHE_SIZE); + vol->mft_record_size, PAGE_SIZE); return false; } /* We cannot support mft record sizes below the sector size. */ @@ -1096,7 +1096,7 @@ static bool check_mft_mirror(ntfs_volume *vol) ntfs_debug("Entering."); /* Compare contents of $MFT and $MFTMirr. */ - mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size; + mrecs_per_page = PAGE_SIZE / vol->mft_record_size; BUG_ON(!mrecs_per_page); BUG_ON(!vol->mftmirr_size); mft_page = mirr_page = NULL; @@ -1615,20 +1615,20 @@ static bool load_and_init_attrdef(ntfs_volume *vol) if (!vol->attrdef) goto iput_failed; index = 0; - max_index = i_size >> PAGE_CACHE_SHIFT; - size = PAGE_CACHE_SIZE; + max_index = i_size >> PAGE_SHIFT; + size = PAGE_SIZE; while (index < max_index) { /* Read the attrdef table and copy it into the linear buffer. */ read_partial_attrdef_page: page = ntfs_map_page(ino->i_mapping, index); if (IS_ERR(page)) goto free_iput_failed; - memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT), + memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), page_address(page), size); ntfs_unmap_page(page); }; - if (size == PAGE_CACHE_SIZE) { - size = i_size & ~PAGE_CACHE_MASK; + if (size == PAGE_SIZE) { + size = i_size & ~PAGE_MASK; if (size) goto read_partial_attrdef_page; } @@ -1684,20 +1684,20 @@ static bool load_and_init_upcase(ntfs_volume *vol) if (!vol->upcase) goto iput_upcase_failed; index = 0; - max_index = i_size >> PAGE_CACHE_SHIFT; - size = PAGE_CACHE_SIZE; + max_index = i_size >> PAGE_SHIFT; + size = PAGE_SIZE; while (index < max_index) { /* Read the upcase table and copy it into the linear buffer. */ read_partial_upcase_page: page = ntfs_map_page(ino->i_mapping, index); if (IS_ERR(page)) goto iput_upcase_failed; - memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT), + memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), page_address(page), size); ntfs_unmap_page(page); }; - if (size == PAGE_CACHE_SIZE) { - size = i_size & ~PAGE_CACHE_MASK; + if (size == PAGE_SIZE) { + size = i_size & ~PAGE_MASK; if (size) goto read_partial_upcase_page; } @@ -2471,14 +2471,14 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) down_read(&vol->lcnbmp_lock); /* * Convert the number of bits into bytes rounded up, then convert into - * multiples of PAGE_CACHE_SIZE, rounding up so that if we have one + * multiples of PAGE_SIZE, rounding up so that if we have one * full and one partial page max_index = 2. */ - max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ + max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", - max_index, PAGE_CACHE_SIZE / 4); + max_index, PAGE_SIZE / 4); for (index = 0; index < max_index; index++) { unsigned long *kaddr; @@ -2491,7 +2491,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) if (IS_ERR(page)) { ntfs_debug("read_mapping_page() error. Skipping " "page (index 0x%lx).", index); - nr_free -= PAGE_CACHE_SIZE * 8; + nr_free -= PAGE_SIZE * 8; continue; } kaddr = kmap_atomic(page); @@ -2503,9 +2503,9 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) * ntfs_readpage(). */ nr_free -= bitmap_weight(kaddr, - PAGE_CACHE_SIZE * BITS_PER_BYTE); + PAGE_SIZE * BITS_PER_BYTE); kunmap_atomic(kaddr); - page_cache_release(page); + put_page(page); } ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); /* @@ -2547,9 +2547,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, pgoff_t index; ntfs_debug("Entering."); - /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ + /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " - "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); + "0x%lx.", max_index, PAGE_SIZE / 4); for (index = 0; index < max_index; index++) { unsigned long *kaddr; @@ -2562,7 +2562,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, if (IS_ERR(page)) { ntfs_debug("read_mapping_page() error. Skipping " "page (index 0x%lx).", index); - nr_free -= PAGE_CACHE_SIZE * 8; + nr_free -= PAGE_SIZE * 8; continue; } kaddr = kmap_atomic(page); @@ -2574,9 +2574,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, * ntfs_readpage(). */ nr_free -= bitmap_weight(kaddr, - PAGE_CACHE_SIZE * BITS_PER_BYTE); + PAGE_SIZE * BITS_PER_BYTE); kunmap_atomic(kaddr); - page_cache_release(page); + put_page(page); } ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", index - 1); @@ -2618,17 +2618,17 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) /* Type of filesystem. */ sfs->f_type = NTFS_SB_MAGIC; /* Optimal transfer block size. */ - sfs->f_bsize = PAGE_CACHE_SIZE; + sfs->f_bsize = PAGE_SIZE; /* * Total data blocks in filesystem in units of f_bsize and since * inodes are also stored in data blocs ($MFT is a file) this is just * the total clusters. */ sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; /* Free data blocks in filesystem in units of f_bsize. */ size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; if (size < 0LL) size = 0LL; /* Free blocks avail to non-superuser, same as above on NTFS. */ @@ -2639,11 +2639,11 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; /* * Convert the maximum number of set bits into bytes rounded up, then - * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we + * convert into multiples of PAGE_SIZE, rounding up so that if we * have one full and one partial page max_index = 2. */ max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) - + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT; read_unlock_irqrestore(&mft_ni->size_lock, flags); /* Number of inodes in filesystem (at this point in time). */ sfs->f_files = size; @@ -2765,15 +2765,15 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) if (!parse_options(vol, (char*)opt)) goto err_out_now; - /* We support sector sizes up to the PAGE_CACHE_SIZE. */ - if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + /* We support sector sizes up to the PAGE_SIZE. */ + if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) { if (!silent) ntfs_error(sb, "Device has unsupported sector size " "(%i). The maximum supported sector " "size on this architecture is %lu " "bytes.", bdev_logical_block_size(sb->s_bdev), - PAGE_CACHE_SIZE); + PAGE_SIZE); goto err_out_now; } /* diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 0cdf497c91ef..2162434728c0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -322,3 +322,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) brelse(di_bh); return acl; } + +int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + acl, NULL, NULL); + posix_acl_release(acl); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. + */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0, ret2; + umode_t mode; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) { + mode = inode->i_mode & ~current_umask(); + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + mode = inode->i_mode; + ret = __posix_acl_create(&acl, GFP_NOFS, &mode); + if (ret < 0) + return ret; + + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + acl, meta_ac, data_ac); + } + } +cleanup: + posix_acl_release(acl); + return ret; +} diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 3fce68d08625..2783a75b3999 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle, struct posix_acl *acl, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac); +extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 70907d638b60..460c0cedab3a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5351,7 +5351,7 @@ static int ocfs2_truncate_rec(handle_t *handle, { int ret; u32 left_cpos, rec_range, trunc_range; - int wants_rotate = 0, is_rightmost_tree_rec = 0; + int is_rightmost_tree_rec = 0; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -5457,7 +5457,6 @@ static int ocfs2_truncate_rec(handle_t *handle, memset(rec, 0, sizeof(*rec)); ocfs2_cleanup_merge(el, index); - wants_rotate = 1; next_free = le16_to_cpu(el->l_next_free_rec); if (is_rightmost_tree_rec && next_free > 1) { @@ -6671,7 +6670,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, { int i; struct page *page; - unsigned int from, to = PAGE_CACHE_SIZE; + unsigned int from, to = PAGE_SIZE; struct super_block *sb = inode->i_sb; BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); @@ -6679,21 +6678,21 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, if (numpages == 0) goto out; - to = PAGE_CACHE_SIZE; + to = PAGE_SIZE; for(i = 0; i < numpages; i++) { page = pages[i]; - from = start & (PAGE_CACHE_SIZE - 1); - if ((end >> PAGE_CACHE_SHIFT) == page->index) - to = end & (PAGE_CACHE_SIZE - 1); + from = start & (PAGE_SIZE - 1); + if ((end >> PAGE_SHIFT) == page->index) + to = end & (PAGE_SIZE - 1); - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > PAGE_SIZE); + BUG_ON(to > PAGE_SIZE); ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, &phys); - start = (page->index + 1) << PAGE_CACHE_SHIFT; + start = (page->index + 1) << PAGE_SHIFT; } out: if (pages) @@ -6712,7 +6711,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, numpages = 0; last_page_bytes = PAGE_ALIGN(end); - index = start >> PAGE_CACHE_SHIFT; + index = start >> PAGE_SHIFT; do { pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); if (!pages[numpages]) { @@ -6723,7 +6722,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, numpages++; index++; - } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); + } while (index < (last_page_bytes >> PAGE_SHIFT)); out: if (ret != 0) { @@ -6950,8 +6949,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * to do that now. */ if (!ocfs2_sparse_alloc(osb) && - PAGE_CACHE_SIZE < osb->s_clustersize) - end = PAGE_CACHE_SIZE; + PAGE_SIZE < osb->s_clustersize) + end = PAGE_SIZE; ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); if (ret) { @@ -6971,8 +6970,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_unlock; } - page_end = PAGE_CACHE_SIZE; - if (PAGE_CACHE_SIZE > osb->s_clustersize) + page_end = PAGE_SIZE; + if (PAGE_SIZE > osb->s_clustersize) page_end = osb->s_clustersize; for (i = 0; i < num_pages; i++) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1581240a7ca0..c034edf3ef38 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -234,7 +234,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); - if (size > PAGE_CACHE_SIZE || + if (size > PAGE_SIZE || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu\n", @@ -247,7 +247,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size) memcpy(kaddr, di->id2.i_data.id_data, size); /* Clear the remaining part of the page */ - memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); + memset(kaddr + size, 0, PAGE_SIZE - size); flush_dcache_page(page); kunmap_atomic(kaddr); @@ -282,7 +282,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; + loff_t start = (loff_t)page->index << PAGE_SHIFT; int ret, unlock = 1; trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, @@ -385,7 +385,7 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, * drop out in that case as it's not worth handling here. */ last = list_entry(pages->prev, struct page, lru); - start = (loff_t)last->index << PAGE_CACHE_SHIFT; + start = (loff_t)last->index << PAGE_SHIFT; if (start >= i_size_read(inode)) goto out_unlock; @@ -511,12 +511,12 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, unsigned int *start, unsigned int *end) { - unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; + unsigned int cluster_start = 0, cluster_end = PAGE_SIZE; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { + if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) { unsigned int cpp; - cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits); cluster_start = cpos % cpp; cluster_start = cluster_start << osb->s_clustersize_bits; @@ -684,13 +684,13 @@ next_bh: return ret; } -#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#if (PAGE_SIZE >= OCFS2_MAX_CLUSTERSIZE) #define OCFS2_MAX_CTXT_PAGES 1 #else -#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE) #endif -#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE) struct ocfs2_unwritten_extent { struct list_head ue_node; @@ -785,7 +785,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) if (pages[i]) { unlock_page(pages[i]); mark_page_accessed(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } } } @@ -808,7 +808,7 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) } } mark_page_accessed(wc->w_target_page); - page_cache_release(wc->w_target_page); + put_page(wc->w_target_page); } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); } @@ -857,7 +857,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, wc->w_di_bh = di_bh; wc->w_type = type; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) wc->w_large_pages = 1; else wc->w_large_pages = 0; @@ -920,7 +920,7 @@ static void ocfs2_write_failure(struct inode *inode, loff_t user_pos, unsigned user_len) { int i; - unsigned from = user_pos & (PAGE_CACHE_SIZE - 1), + unsigned from = user_pos & (PAGE_SIZE - 1), to = user_pos + user_len; struct page *tmppage; @@ -960,7 +960,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, (page_offset(page) <= user_pos)); if (page == wc->w_target_page) { - map_from = user_pos & (PAGE_CACHE_SIZE - 1); + map_from = user_pos & (PAGE_SIZE - 1); map_to = map_from + user_len; if (new) @@ -1034,7 +1034,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct inode *inode = mapping->host; loff_t last_byte; - target_index = user_pos >> PAGE_CACHE_SHIFT; + target_index = user_pos >> PAGE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For @@ -1053,14 +1053,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, */ last_byte = max(user_pos + user_len, i_size_read(inode)); BUG_ON(last_byte < 1); - end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1; if ((start + wc->w_num_pages) > end_index) wc->w_num_pages = end_index - start; } else { wc->w_num_pages = 1; start = target_index; } - end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT; + end_index = (user_pos + user_len - 1) >> PAGE_SHIFT; for(i = 0; i < wc->w_num_pages; i++) { index = start + i; @@ -1082,7 +1082,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, goto out; } - page_cache_get(mmap_page); + get_page(mmap_page); wc->w_pages[i] = mmap_page; wc->w_target_locked = true; } else if (index >= target_index && index <= end_index && @@ -1272,7 +1272,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, { struct ocfs2_write_cluster_desc *desc; - wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_from = pos & (PAGE_SIZE - 1); wc->w_target_to = wc->w_target_from + len; if (alloc == 0) @@ -1309,7 +1309,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, &wc->w_target_to); } else { wc->w_target_from = 0; - wc->w_target_to = PAGE_CACHE_SIZE; + wc->w_target_to = PAGE_SIZE; } } @@ -1981,7 +1981,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, struct page *page, void *fsdata) { int i, ret; - unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + unsigned from, to, start = pos & (PAGE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_write_ctxt *wc = fsdata; @@ -2027,8 +2027,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, from = wc->w_target_from; to = wc->w_target_to; - BUG_ON(from > PAGE_CACHE_SIZE || - to > PAGE_CACHE_SIZE || + BUG_ON(from > PAGE_SIZE || + to > PAGE_SIZE || to < from); } else { /* @@ -2037,7 +2037,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, * to flush their entire range. */ from = 0; - to = PAGE_CACHE_SIZE; + to = PAGE_SIZE; } if (page_has_buffers(tmppage)) { @@ -2311,7 +2311,7 @@ static void ocfs2_dio_end_io_write(struct inode *inode, /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we * are in that context. */ if (dwc->dw_writer_pid != task_pid_nr(current)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); locked = 1; } @@ -2390,7 +2390,7 @@ out: ocfs2_free_alloc_context(meta_ac); ocfs2_run_deallocs(osb, &dealloc); if (locked) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); } @@ -2423,13 +2423,11 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, return 0; } -static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - loff_t end = offset + iter->count; get_block_t *get_block; /* @@ -2440,7 +2438,8 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; /* Fallback to buffered I/O if we do not support append dio. */ - if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb)) + if (iocb->ki_pos + iter->count > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) return 0; if (iov_iter_rw(iter) == READ) @@ -2449,7 +2448,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block = ocfs2_dio_get_block; return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, offset, get_block, + iter, get_block, ocfs2_dio_end_io, NULL, 0); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bd15929b5f92..a8d15beee5cb 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -417,13 +417,13 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - vec_start = (cs << bits) % PAGE_CACHE_SIZE; + vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { current_page = cs / spp; page = reg->hr_slot_data[current_page]; - vec_len = min(PAGE_CACHE_SIZE - vec_start, - (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); + vec_len = min(PAGE_SIZE - vec_start, + (max_slots-cs) * (PAGE_SIZE/spp) ); mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", current_page, vec_len, vec_start); @@ -431,7 +431,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, len = bio_add_page(bio, page, vec_len, vec_start); if (len != vec_len) break; - cs += vec_len / (PAGE_CACHE_SIZE/spp); + cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; } @@ -1456,7 +1456,6 @@ static void o2hb_region_release(struct config_item *item) static int o2hb_read_block_input(struct o2hb_region *reg, const char *page, - size_t count, unsigned long *ret_bytes, unsigned int *ret_bits) { @@ -1499,8 +1498,8 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item, if (reg->hr_bdev) return -EINVAL; - status = o2hb_read_block_input(reg, page, count, - &block_bytes, &block_bits); + status = o2hb_read_block_input(reg, page, &block_bytes, + &block_bits); if (status) return status; @@ -1576,7 +1575,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) static void o2hb_init_region_params(struct o2hb_region *reg) { - reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits; reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2d0acd6678fe..4238eb28889f 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -600,10 +600,11 @@ static void o2net_set_nn_state(struct o2net_node *nn, static void o2net_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); + struct o2net_sock_container *sc; - read_lock(&sk->sk_callback_lock); - if (sk->sk_user_data) { - struct o2net_sock_container *sc = sk->sk_user_data; + read_lock_bh(&sk->sk_callback_lock); + sc = sk->sk_user_data; + if (sc) { sclog(sc, "data_ready hit\n"); o2net_set_data_ready_time(sc); o2net_sc_queue_work(sc, &sc->sc_rx_work); @@ -611,7 +612,7 @@ static void o2net_data_ready(struct sock *sk) } else { ready = sk->sk_data_ready; } - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); ready(sk); } @@ -622,7 +623,7 @@ static void o2net_state_change(struct sock *sk) void (*state_change)(struct sock *sk); struct o2net_sock_container *sc; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); sc = sk->sk_user_data; if (sc == NULL) { state_change = sk->sk_state_change; @@ -649,7 +650,7 @@ static void o2net_state_change(struct sock *sk) break; } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); state_change(sk); } @@ -2012,7 +2013,7 @@ static void o2net_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (ready == NULL) { /* check for teardown race */ ready = sk->sk_data_ready; @@ -2039,7 +2040,7 @@ static void o2net_listen_data_ready(struct sock *sk) } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); if (ready != NULL) ready(sk); } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9aed6e202201..13719d3f35f8 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2455,6 +2455,8 @@ int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->spinlock); + ret = 0; + done: dlm_put(dlm); return ret; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 03768bb3aab1..47b3b2d4e775 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -571,8 +571,8 @@ static int dlmfs_fill_super(struct super_block * sb, int silent) { sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = DLMFS_MAGIC; sb->s_op = &dlmfs_ops; sb->s_root = d_make_root(dlmfs_get_root_inode(sb)); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 474e57f834e6..1eaa9100c889 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -54,6 +54,7 @@ #include "uptodate.h" #include "quota.h" #include "refcounttree.h" +#include "acl.h" #include "buffer_head_io.h" @@ -3623,6 +3624,8 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, filemap_fdatawait(mapping); } + forget_all_cached_acls(inode); + out: return UNBLOCK_CONTINUE; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c18ab45f8d21..4e7b0dc22450 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -770,14 +770,14 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index = abs_from >> PAGE_CACHE_SHIFT; + unsigned long index = abs_from >> PAGE_SHIFT; handle_t *handle; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; BUG_ON(abs_from >= abs_to); - BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); @@ -794,10 +794,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } /* Get the offsets within the page that we want to zero */ - zero_from = abs_from & (PAGE_CACHE_SIZE - 1); - zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + zero_from = abs_from & (PAGE_SIZE - 1); + zero_to = abs_to & (PAGE_SIZE - 1); if (!zero_to) - zero_to = PAGE_CACHE_SIZE; + zero_to = PAGE_SIZE; trace_ocfs2_write_zero_page( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -851,7 +851,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, out_unlock: unlock_page(page); - page_cache_release(page); + put_page(page); out_commit_trans: if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); @@ -959,7 +959,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, BUG_ON(range_start >= range_end); while (zero_pos < range_end) { - next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE; if (next_pos > range_end) next_pos = range_end; rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); @@ -1268,20 +1268,20 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - brelse(bh); /* Release quota pointers in case we acquired them */ for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = posix_acl_chmod(inode, inode->i_mode); + status = ocfs2_acl_chmod(inode, bh); if (status < 0) mlog_errno(status); } if (inode_locked) ocfs2_inode_unlock(inode, 1); + brelse(bh); return status; } @@ -1290,7 +1290,7 @@ int ocfs2_getattr(struct vfsmount *mnt, struct kstat *stat) { struct inode *inode = d_inode(dentry); - struct super_block *sb = d_inode(dentry)->i_sb; + struct super_block *sb = dentry->d_sb; struct ocfs2_super *osb = sb->s_fs_info; int err; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 12f4a9e9800f..c56a7679df93 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -176,12 +176,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, } if (is_bad_inode(inode)) { iput(inode); - if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) || - (flags & OCFS2_FI_FLAG_FILECHECK_FIX)) - /* Return OCFS2_FILECHECK_ERR_XXX related errno */ - inode = ERR_PTR(rc); - else - inode = ERR_PTR(-ESTALE); + inode = ERR_PTR(rc); goto bail; } @@ -262,7 +257,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; if (args->fi_sysfile_type != 0) - lockdep_set_class(&inode->i_mutex, + lockdep_set_class(&inode->i_rwsem, &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index f4cd3c3e9fb7..497a4171ef61 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) { - return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); + return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9ea081f4e6e4..71545ad4628c 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -65,13 +65,13 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); - unsigned int len = PAGE_CACHE_SIZE; + unsigned int len = PAGE_SIZE; pgoff_t last_index; struct page *locked_page = NULL; void *fsdata; loff_t size = i_size_read(inode); - last_index = (size - 1) >> PAGE_CACHE_SHIFT; + last_index = (size - 1) >> PAGE_SHIFT; /* * There are cases that lead to the page no longer bebongs to the @@ -102,7 +102,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, * because the "write" would invalidate their data. */ if (page->index == last_index) - len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; + len = ((size - 1) & ~PAGE_MASK) + 1; ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, &locked_page, &fsdata, di_bh, page); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6b3e87189a64..a8f1225e6d9b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; - struct posix_acl *default_acl = NULL, *acl = NULL; struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, @@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (status) { - mlog_errno(status); - goto leave; - } - handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - if (default_acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_DEFAULT, default_acl, - meta_ac, data_ac); - } - if (!status && acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_ACCESS, acl, - meta_ac, data_ac); - } + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + meta_ac, data_ac); if (status < 0) { mlog_errno(status); @@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 6cf6538a0651..e63af7ddfe68 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -822,10 +822,10 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, u32 clusters = pg_index; unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; - if (unlikely(PAGE_CACHE_SHIFT > cbits)) - clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); - else if (PAGE_CACHE_SHIFT < cbits) - clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); + if (unlikely(PAGE_SHIFT > cbits)) + clusters = pg_index << (PAGE_SHIFT - cbits); + else if (PAGE_SHIFT < cbits) + clusters = pg_index >> (cbits - PAGE_SHIFT); return clusters; } @@ -839,10 +839,10 @@ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; pgoff_t index = clusters; - if (PAGE_CACHE_SHIFT > cbits) { - index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits); - } else if (PAGE_CACHE_SHIFT < cbits) { - index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT); + if (PAGE_SHIFT > cbits) { + index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits); + } else if (PAGE_SHIFT < cbits) { + index = (pgoff_t)clusters << (cbits - PAGE_SHIFT); } return index; @@ -853,8 +853,8 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int pages_per_cluster = 1; - if (PAGE_CACHE_SHIFT < cbits) - pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + if (PAGE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_SHIFT); return pages_per_cluster; } diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 540ab5b75dbb..44d178b8d1aa 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -580,7 +580,7 @@ struct ocfs2_extended_slot { /*00*/ __u8 es_valid; __u8 es_reserved1[3]; __le32 es_node_num; -/*10*/ +/*08*/ }; /* diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 3892f3c079ca..ab6a6cdcf91c 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -867,6 +867,10 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) int status = 0; trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); + if (!sb_has_quota_loaded(sb, type)) { + status = -ESRCH; + goto out; + } status = ocfs2_lock_global_qf(info, 0); if (status < 0) goto out; @@ -878,8 +882,11 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) out_global: ocfs2_unlock_global_qf(info, 0); out: - /* Avoid logging ENOENT since it just means there isn't next ID */ - if (status && status != -ENOENT) + /* + * Avoid logging ENOENT since it just means there isn't next ID and + * ESRCH which means quota isn't enabled for the filesystem. + */ + if (status && status != -ENOENT && status != -ESRCH) mlog_errno(status); return status; } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3eff031aaf26..92bbe93bfe10 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2937,16 +2937,16 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, end = i_size_read(inode); while (offset < end) { - page_index = offset >> PAGE_CACHE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + page_index = offset >> PAGE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) map_end = end; /* from, to is the offset within the page. */ - from = offset & (PAGE_CACHE_SIZE - 1); - to = PAGE_CACHE_SIZE; - if (map_end & (PAGE_CACHE_SIZE - 1)) - to = map_end & (PAGE_CACHE_SIZE - 1); + from = offset & (PAGE_SIZE - 1); + to = PAGE_SIZE; + if (map_end & (PAGE_SIZE - 1)) + to = map_end & (PAGE_SIZE - 1); page = find_or_create_page(mapping, page_index, GFP_NOFS); if (!page) { @@ -2956,10 +2956,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } /* - * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page + * In case PAGE_SIZE <= CLUSTER_SIZE, This page * can't be dirtied before we CoW it out. */ - if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) + if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); if (!PageUptodate(page)) { @@ -2987,7 +2987,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, mark_page_accessed(page); unlock: unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; offset = map_end; if (ret) @@ -3165,8 +3165,8 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, } while (offset < end) { - page_index = offset >> PAGE_CACHE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + page_index = offset >> PAGE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) map_end = end; @@ -3182,7 +3182,7 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, mark_page_accessed(page); unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; offset = map_end; if (ret) @@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; - struct posix_acl *default_acl, *acl; - umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - mode = inode->i_mode; - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) { - mlog_errno(error); - return error; - } - error = ocfs2_create_inode_in_orphan(dir, mode, + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name, - default_acl, acl); + &new_dentry->d_name); if (error) mlog_errno(error); } out: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 1e09592148ad..d7407994f308 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -535,12 +535,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb) spin_unlock(&osb->osb_lock); status = ocfs2_update_disk_slot(osb, si, slot_num); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto bail; - } -bail: ocfs2_free_slot_info(osb); } - diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7db631e1c8b0..d7cae3327de5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -605,8 +605,8 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, /* * We might be limited by page cache size. */ - if (bytes > PAGE_CACHE_SIZE) { - bytes = PAGE_CACHE_SIZE; + if (bytes > PAGE_SIZE) { + bytes = PAGE_SIZE; trim = 1; /* * Shift by 31 here so that we don't get larger than diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 7d3d979f57d9..ad16995c9e7a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7216,12 +7216,10 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl) + const struct qstr *qstr) { - struct buffer_head *dir_bh = NULL; int ret = 0; + struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7234,11 +7232,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, mlog_errno(ret); goto leave; } - - if (!ret && default_acl) - ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); - if (!ret && acl) - ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); @@ -7250,10 +7246,10 @@ leave: * 'security' attributes support */ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -7321,10 +7317,10 @@ const struct xattr_handler ocfs2_xattr_security_handler = { * 'trusted' attributes support */ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -7346,14 +7342,14 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = { * 'user' attributes support */ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unusde, struct inode *inode, + const char *name, void *buffer, size_t size) { - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, buffer, size); } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index f10d5b93c366..1633cc15ea1f 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl); + const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index f833bf8d5792..c8cbf3b60645 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -452,6 +452,6 @@ const struct inode_operations omfs_dir_inops = { const struct file_operations omfs_dir_operations = { .read = generic_read_dir, - .iterate = omfs_readdir, + .iterate_shared = omfs_readdir, .llseek = generic_file_llseek, }; diff --git a/fs/open.c b/fs/open.c index 17cb6b1dab75..93ae3cdee4ab 100644 --- a/fs/open.c +++ b/fs/open.c @@ -65,7 +65,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, return ret; } -long vfs_truncate(struct path *path, loff_t length) +long vfs_truncate(const struct path *path, loff_t length) { struct inode *inode; long error; @@ -499,7 +499,7 @@ out: return error; } -static int chmod_common(struct path *path, umode_t mode) +static int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; @@ -564,7 +564,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) return sys_fchmodat(AT_FDCWD, filename, mode); } -static int chown_common(struct path *path, uid_t user, gid_t group) +static int chown_common(const struct path *path, uid_t user, gid_t group) { struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; @@ -713,7 +713,7 @@ static int do_dentry_open(struct file *f, } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); @@ -840,16 +840,12 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = path->dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = vfs_select_inode(path->dentry, file->f_flags); - file->f_path = *path; - if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { - inode = dentry->d_op->d_select_inode(dentry, file->f_flags); - if (IS_ERR(inode)) - return PTR_ERR(inode); - } + if (IS_ERR(inode)) + return PTR_ERR(inode); + file->f_path = *path; return do_dentry_open(file, inode, NULL, cred); } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index b61b883c8ff8..c7a86993d97e 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -166,7 +166,7 @@ static int openpromfs_readdir(struct file *, struct dir_context *); static const struct file_operations openprom_operations = { .read = generic_read_dir, - .iterate = openpromfs_readdir, + .iterate_shared = openpromfs_readdir, .llseek = generic_file_llseek, }; diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index f30b6ecacdd1..324f0af40d7b 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -153,7 +153,6 @@ static int orangefs_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file->f_path.dentry; struct orangefs_kernel_op_s *new_op = NULL; struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode); - int buffer_full = 0; struct orangefs_readdir_response_s readdir_response; void *dents_buf; int i = 0; @@ -235,7 +234,7 @@ get_new_buffer_index: if (ret == -EIO && op_state_purged(new_op)) { gossip_err("%s: Client is down. Aborting readdir call.\n", __func__); - goto out_slot; + goto out_free_op; } if (ret < 0 || new_op->downcall.status != 0) { @@ -244,14 +243,14 @@ get_new_buffer_index: new_op->downcall.status); if (ret >= 0) ret = new_op->downcall.status; - goto out_slot; + goto out_free_op; } dents_buf = new_op->downcall.trailer_buf; if (dents_buf == NULL) { gossip_err("Invalid NULL buffer in readdir response\n"); ret = -ENOMEM; - goto out_slot; + goto out_free_op; } bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size, @@ -350,8 +349,7 @@ get_new_buffer_index: /* * Did we hit the end of the directory? */ - if (readdir_response.token == ORANGEFS_READDIR_END && - !buffer_full) { + if (readdir_response.token == ORANGEFS_READDIR_END) { gossip_debug(GOSSIP_DIR_DEBUG, "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n"); ctx->pos = ORANGEFS_READDIR_END; @@ -363,8 +361,6 @@ out_destroy_handle: out_vfree: gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf); vfree(dents_buf); -out_slot: - orangefs_readdir_index_put(buffer_index); out_free_op: op_release(new_op); gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index ae92795ed965..491e82c6f705 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -445,7 +445,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); - mutex_lock(&file->f_mapping->host->i_mutex); + inode_lock(file->f_mapping->host); /* Make sure generic_write_checks sees an up to date inode size. */ if (file->f_flags & O_APPEND) { @@ -492,7 +492,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite out: - mutex_unlock(&file->f_mapping->host->i_mutex); + inode_unlock(file->f_mapping->host); return rc; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 2382e267b49e..85640e955cde 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -18,8 +18,8 @@ static int read_one_page(struct page *page) int max_block; ssize_t bytes_read = 0; struct inode *inode = page->mapping->host; - const __u32 blocksize = PAGE_CACHE_SIZE; /* inode->i_blksize */ - const __u32 blockbits = PAGE_CACHE_SHIFT; /* inode->i_blkbits */ + const __u32 blocksize = PAGE_SIZE; /* inode->i_blksize */ + const __u32 blockbits = PAGE_SHIFT; /* inode->i_blkbits */ struct iov_iter to; struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE}; @@ -86,7 +86,7 @@ static int orangefs_readpages(struct file *file, "failure adding page to cache, read_one_page returned: %d\n", ret); } else { - page_cache_release(page); + put_page(page); } } BUG_ON(!list_empty(pages)); @@ -204,22 +204,8 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) if (ret != 0) return ret; - /* - * Only change the c/mtime if we are changing the size or we are - * explicitly asked to change it. This handles the semantic difference - * between truncate() and ftruncate() as implemented in the VFS. - * - * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a - * special case where we need to update the times despite not having - * these flags set. For all other operations the VFS set these flags - * explicitly if it wants a timestamp update. - */ - if (orig_size != i_size_read(inode) && - !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { - iattr->ia_ctime = iattr->ia_mtime = - current_fs_time(inode->i_sb); + if (orig_size != i_size_read(inode)) iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; - } return ret; } @@ -328,7 +314,7 @@ static int orangefs_init_iops(struct inode *inode) case S_IFREG: inode->i_op = &orangefs_file_inode_operations; inode->i_fop = &orangefs_file_operations; - inode->i_blkbits = PAGE_CACHE_SHIFT; + inode->i_blkbits = PAGE_SHIFT; break; case S_IFLNK: inode->i_op = &orangefs_symlink_inode_operations; @@ -456,7 +442,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_size = PAGE_CACHE_SIZE; + inode->i_size = PAGE_SIZE; inode->i_rdev = dev; error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref); diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 1f8acc9f9a88..75375e90a63f 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -170,7 +170,7 @@ orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap) int i; for (i = 0; i < bufmap->page_count; i++) - page_cache_release(bufmap->page_array[i]); + put_page(bufmap->page_array[i]); } static void @@ -299,7 +299,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap, for (i = 0; i < ret; i++) { SetPageError(bufmap->page_array[i]); - page_cache_release(bufmap->page_array[i]); + put_page(bufmap->page_array[i]); } return -ENOMEM; } diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 19670b8b4053..1714a737d556 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -126,8 +126,7 @@ out: void orangefs_debugfs_cleanup(void) { - if (debug_dir) - debugfs_remove_recursive(debug_dir); + debugfs_remove_recursive(debug_dir); } /* open ORANGEFS_KMOD_DEBUG_HELP_FILE */ diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index a9925e296ceb..2281882f718e 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -612,11 +612,11 @@ do { \ static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - mutex_lock(&inode->i_mutex); + inode_lock(inode); #endif i_size_write(inode, i_size); #if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); #endif } diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 40f5163b56aa..2d129b5886ee 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -303,7 +303,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size) } break; case S_IFDIR: - inode->i_size = PAGE_CACHE_SIZE; + inode->i_size = PAGE_SIZE; orangefs_inode->blksize = (1 << inode->i_blkbits); spin_lock(&inode->i_lock); inode_set_bytes(inode, inode->i_size); @@ -315,9 +315,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size) inode->i_size = (loff_t)strlen(new_op-> downcall.resp.getattr.link_target); orangefs_inode->blksize = (1 << inode->i_blkbits); - strlcpy(orangefs_inode->link_target, + ret = strscpy(orangefs_inode->link_target, new_op->downcall.resp.getattr.link_target, ORANGEFS_NAME_MAX); + if (ret == -E2BIG) { + ret = -EIO; + goto out; + } inode->i_link = orangefs_inode->link_target; } break; diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 45ce4ff4cbc7..1efc6f8a5224 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -1,3 +1,4 @@ +#include <linux/kernel.h> #include <linux/types.h> #include <linux/spinlock_types.h> #include <linux/slab.h> @@ -74,8 +75,8 @@ static inline void ORANGEFS_khandle_to(const struct orangefs_khandle *kh, void *p, int size) { - memset(p, 0, size); memcpy(p, kh->u, 16); + memset(p + 16, 0, size - 16); } @@ -407,7 +408,7 @@ enum { * space. Zero signifies the upstream version of the kernel module. */ #define ORANGEFS_KERNEL_PROTO_VERSION 0 -#define ORANGEFS_MINIMUM_USERSPACE_VERSION 20904 +#define ORANGEFS_MINIMUM_USERSPACE_VERSION 20903 /* * describes memory regions to map in the ORANGEFS_DEV_MAP ioctl. @@ -427,26 +428,28 @@ struct ORANGEFS_dev_map_desc { /* gossip.h *****************************************************************/ #ifdef GOSSIP_DISABLE_DEBUG -#define gossip_debug(mask, format, f...) do {} while (0) +#define gossip_debug(mask, fmt, ...) \ +do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) #else extern __u64 gossip_debug_mask; extern struct client_debug_mask client_debug_mask; /* try to avoid function call overhead by checking masks in macro */ -#define gossip_debug(mask, format, f...) \ -do { \ - if (gossip_debug_mask & mask) \ - printk(format, ##f); \ +#define gossip_debug(mask, fmt, ...) \ +do { \ + if (gossip_debug_mask & (mask)) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ } while (0) #endif /* GOSSIP_DISABLE_DEBUG */ /* do file and line number printouts w/ the GNU preprocessor */ -#define gossip_ldebug(mask, format, f...) \ - gossip_debug(mask, "%s: " format, __func__, ##f) - -#define gossip_err printk -#define gossip_lerr(format, f...) \ - gossip_err("%s line %d: " format, \ - __FILE__, \ - __LINE__, \ - ##f) +#define gossip_ldebug(mask, fmt, ...) \ + gossip_debug(mask, "%s: " fmt, __func__, ##__VA_ARGS__) + +#define gossip_err pr_err +#define gossip_lerr(fmt, ...) \ + gossip_err("%s line %d: " fmt, \ + __FILE__, __LINE__, ##__VA_ARGS__) diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index ef5da7538cd5..99c19545752c 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -73,10 +73,6 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix, "%s: prefix %s name %s, buffer_size %zd\n", __func__, prefix, name, size); - if (name == NULL || (size > 0 && buffer == NULL)) { - gossip_err("orangefs_inode_getxattr: bogus NULL pointers\n"); - return -EINVAL; - } if ((strlen(name) + strlen(prefix)) >= ORANGEFS_MAX_XATTR_NAMELEN) { gossip_err("Invalid key length (%d)\n", (int)(strlen(name) + strlen(prefix))); @@ -146,8 +142,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix, goto out_release_op; } - memset(buffer, 0, size); memcpy(buffer, new_op->downcall.resp.getxattr.val, length); + memset(buffer + length, 0, size - length); gossip_debug(GOSSIP_XATTR_DEBUG, "orangefs_inode_getxattr: inode %pU " "key %s key_sz %d, val_len %d\n", @@ -239,8 +235,7 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix, "%s: prefix %s, name %s, buffer_size %zd\n", __func__, prefix, name, size); - if (size < 0 || - size >= ORANGEFS_MAX_XATTR_VALUELEN || + if (size >= ORANGEFS_MAX_XATTR_VALUELEN || flags < 0) { gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n", (int)size, @@ -248,12 +243,6 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix, return -EINVAL; } - if (name == NULL || - (size > 0 && value == NULL)) { - gossip_err("orangefs_inode_setxattr: bogus NULL pointers!\n"); - return -EINVAL; - } - internal_flag = convert_to_internal_xattr_flags(flags); if (prefix) { @@ -353,10 +342,6 @@ ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size) gossip_err("%s: bogus NULL pointers\n", __func__); return -EINVAL; } - if (size < 0) { - gossip_err("Invalid size (%d)\n", (int)size); - return -EINVAL; - } down_read(&orangefs_inode->xattr_sem); new_op = op_alloc(ORANGEFS_VFS_OP_LISTXATTR); @@ -478,12 +463,13 @@ static int orangefs_xattr_set_default(const struct xattr_handler *handler, } static int orangefs_xattr_get_default(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, void *buffer, size_t size) { - return orangefs_inode_getxattr(dentry->d_inode, + return orangefs_inode_getxattr(inode, ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, name, buffer, @@ -507,12 +493,13 @@ static int orangefs_xattr_set_trusted(const struct xattr_handler *handler, } static int orangefs_xattr_get_trusted(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, void *buffer, size_t size) { - return orangefs_inode_getxattr(dentry->d_inode, + return orangefs_inode_getxattr(inode, ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, name, buffer, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index a4ff5d0d7db9..c7b31a03dc9c 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -246,8 +246,8 @@ static bool ovl_need_xattr_filter(struct dentry *dentry, return false; } -ssize_t ovl_getxattr(struct dentry *dentry, const char *name, - void *value, size_t size) +ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) { struct path realpath; enum ovl_path_type type = ovl_path_real(dentry, &realpath); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6a7090f4a441..99ec4b035237 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -173,8 +173,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_permission(struct inode *inode, int mask); int ovl_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -ssize_t ovl_getxattr(struct dentry *dentry, const char *name, - void *value, size_t size); +ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 6ec1e43a9a54..da186ee4f846 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -218,7 +218,9 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); old_cred = override_creds(override_cred); - err = mutex_lock_killable(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); + err = 0; + // XXX: err = mutex_lock_killable(&dir->d_inode->i_mutex); if (!err) { while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ef64984c9bbc..ed53ae0fe868 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -274,7 +274,7 @@ static bool ovl_is_opaquedir(struct dentry *dentry) if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) return false; - res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); + res = inode->i_op->getxattr(dentry, inode, OVL_XATTR_OPAQUE, &val, 1); if (res == 1 && val == 'y') return true; @@ -295,6 +295,37 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode) +{ + struct dentry *real; + + if (d_is_dir(dentry)) { + if (!inode || inode == d_inode(dentry)) + return dentry; + goto bug; + } + + real = ovl_dentry_upper(dentry); + if (real && (!inode || inode == d_inode(real))) + return real; + + real = ovl_dentry_lower(dentry); + if (!real) + goto bug; + + if (!inode || inode == d_inode(real)) + return real; + + /* Handle recursion */ + if (real->d_flags & DCACHE_OP_REAL) + return real->d_op->d_real(real, inode); + +bug: + WARN(1, "ovl_d_real(%pd4, %s:%lu\n): real dentry not found\n", dentry, + inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0); + return dentry; +} + static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe = dentry->d_fsdata; @@ -339,11 +370,13 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, .d_select_inode = ovl_d_select_inode, + .d_real = ovl_d_real, }; static const struct dentry_operations ovl_reval_dentry_operations = { .d_release = ovl_dentry_release, .d_select_inode = ovl_d_select_inode, + .d_real = ovl_d_real, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, }; @@ -378,9 +411,7 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - inode_lock(dir->d_inode); - dentry = lookup_one_len(name->name, dir, name->len); - inode_unlock(dir->d_inode); + dentry = lookup_hash(name, dir); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) diff --git a/fs/pipe.c b/fs/pipe.c index ab8dad3ccb6a..0d3f5165cb0b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -134,7 +134,7 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, if (page_count(page) == 1 && !pipe->tmp_page) pipe->tmp_page = page; else - page_cache_release(page); + put_page(page); } /** @@ -180,7 +180,7 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); */ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - page_cache_get(buf->page); + get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); @@ -211,7 +211,7 @@ EXPORT_SYMBOL(generic_pipe_buf_confirm); void generic_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - page_cache_release(buf->page); + put_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_release); diff --git a/fs/pnode.c b/fs/pnode.c index c524fdddc7fb..99899705b105 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -198,7 +198,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) /* all accesses are serialized by namespace_sem */ static struct user_namespace *user_ns; -static struct mount *last_dest, *last_source, *dest_master; +static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct mountpoint *mp; static struct hlist_head *list; @@ -221,20 +221,22 @@ static int propagate_one(struct mount *m) type = CL_MAKE_SHARED; } else { struct mount *n, *p; + bool done; for (n = m; ; n = p) { p = n->mnt_master; - if (p == dest_master || IS_MNT_MARKED(p)) { - while (last_dest->mnt_master != p) { - last_source = last_source->mnt_master; - last_dest = last_source->mnt_parent; - } - if (!peers(n, last_dest)) { - last_source = last_source->mnt_master; - last_dest = last_source->mnt_parent; - } + if (p == dest_master || IS_MNT_MARKED(p)) break; - } } + do { + struct mount *parent = last_source->mnt_parent; + if (last_source == first_source) + break; + done = parent->mnt_master == p; + if (done && peers(n, parent)) + break; + last_source = last_source->mnt_master; + } while (!done); + type = CL_SLAVE; /* beginning of peer group among the slaves? */ if (IS_MNT_SHARED(m)) @@ -286,6 +288,7 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, */ user_ns = current->nsproxy->mnt_ns->user_ns; last_dest = dest_mnt; + first_source = source_mnt; last_source = source_mnt; mp = dest_mp; list = tree_list; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 711dd5170376..2c60f17e7d92 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -21,7 +21,7 @@ #include <linux/export.h> #include <linux/user_namespace.h> -struct posix_acl **acl_by_type(struct inode *inode, int type) +static struct posix_acl **acl_by_type(struct inode *inode, int type) { switch (type) { case ACL_TYPE_ACCESS: @@ -32,19 +32,22 @@ struct posix_acl **acl_by_type(struct inode *inode, int type) BUG(); } } -EXPORT_SYMBOL(acl_by_type); struct posix_acl *get_cached_acl(struct inode *inode, int type) { struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *acl = ACCESS_ONCE(*p); - if (acl) { - spin_lock(&inode->i_lock); - acl = *p; - if (acl != ACL_NOT_CACHED) - acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); + struct posix_acl *acl; + + for (;;) { + rcu_read_lock(); + acl = rcu_dereference(*p); + if (!acl || is_uncached_acl(acl) || + atomic_inc_not_zero(&acl->a_refcount)) + break; + rcu_read_unlock(); + cpu_relax(); } + rcu_read_unlock(); return acl; } EXPORT_SYMBOL(get_cached_acl); @@ -59,58 +62,72 @@ void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) { struct posix_acl **p = acl_by_type(inode, type); struct posix_acl *old; - spin_lock(&inode->i_lock); - old = *p; - rcu_assign_pointer(*p, posix_acl_dup(acl)); - spin_unlock(&inode->i_lock); - if (old != ACL_NOT_CACHED) + + old = xchg(p, posix_acl_dup(acl)); + if (!is_uncached_acl(old)) posix_acl_release(old); } EXPORT_SYMBOL(set_cached_acl); -void forget_cached_acl(struct inode *inode, int type) +static void __forget_cached_acl(struct posix_acl **p) { - struct posix_acl **p = acl_by_type(inode, type); struct posix_acl *old; - spin_lock(&inode->i_lock); - old = *p; - *p = ACL_NOT_CACHED; - spin_unlock(&inode->i_lock); - if (old != ACL_NOT_CACHED) + + old = xchg(p, ACL_NOT_CACHED); + if (!is_uncached_acl(old)) posix_acl_release(old); } + +void forget_cached_acl(struct inode *inode, int type) +{ + __forget_cached_acl(acl_by_type(inode, type)); +} EXPORT_SYMBOL(forget_cached_acl); void forget_all_cached_acls(struct inode *inode) { - struct posix_acl *old_access, *old_default; - spin_lock(&inode->i_lock); - old_access = inode->i_acl; - old_default = inode->i_default_acl; - inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; - spin_unlock(&inode->i_lock); - if (old_access != ACL_NOT_CACHED) - posix_acl_release(old_access); - if (old_default != ACL_NOT_CACHED) - posix_acl_release(old_default); + __forget_cached_acl(&inode->i_acl); + __forget_cached_acl(&inode->i_default_acl); } EXPORT_SYMBOL(forget_all_cached_acls); struct posix_acl *get_acl(struct inode *inode, int type) { + void *sentinel; + struct posix_acl **p; struct posix_acl *acl; + /* + * The sentinel is used to detect when another operation like + * set_cached_acl() or forget_cached_acl() races with get_acl(). + * It is guaranteed that is_uncached_acl(sentinel) is true. + */ + acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) + if (!is_uncached_acl(acl)) return acl; if (!IS_POSIXACL(inode)) return NULL; + sentinel = uncached_acl_sentinel(current); + p = acl_by_type(inode, type); + /* - * A filesystem can force a ACL callback by just never filling the - * ACL cache. But normally you'd fill the cache either at inode - * instantiation time, or on the first ->get_acl call. + * If the ACL isn't being read yet, set our sentinel. Otherwise, the + * current value of the ACL will not be ACL_NOT_CACHED and so our own + * sentinel will not be set; another task will update the cache. We + * could wait for that other task to complete its job, but it's easier + * to just call ->get_acl to fetch the ACL ourself. (This is going to + * be an unlikely race.) + */ + if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) + /* fall through */ ; + + /* + * Normally, the ACL returned by ->get_acl will be cached. + * A filesystem can prevent that by calling + * forget_cached_acl(inode, type) in ->get_acl. * * If the filesystem doesn't have a get_acl() function at all, we'll * just create the negative cache entry. @@ -119,7 +136,24 @@ struct posix_acl *get_acl(struct inode *inode, int type) set_cached_acl(inode, type, NULL); return NULL; } - return inode->i_op->get_acl(inode, type); + acl = inode->i_op->get_acl(inode, type); + + if (IS_ERR(acl)) { + /* + * Remove our sentinel so that we don't block future attempts + * to cache the ACL. + */ + cmpxchg(p, sentinel, ACL_NOT_CACHED); + return acl; + } + + /* + * Cache the result, but only if our sentinel is still in place. + */ + posix_acl_dup(acl); + if (unlikely(cmpxchg(p, sentinel, acl) != sentinel)) + posix_acl_release(acl); + return acl; } EXPORT_SYMBOL(get_acl); @@ -763,18 +797,18 @@ EXPORT_SYMBOL (posix_acl_to_xattr); static int posix_acl_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *value, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) { struct posix_acl *acl; int error; - if (!IS_POSIXACL(d_backing_inode(dentry))) + if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; - if (d_is_symlink(dentry)) + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - acl = get_acl(d_backing_inode(dentry), handler->flags); + acl = get_acl(inode, handler->flags); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) diff --git a/fs/proc/array.c b/fs/proc/array.c index b6c00ce0e29e..88c7de12197b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -83,6 +83,7 @@ #include <linux/tracehook.h> #include <linux/string_helpers.h> #include <linux/user_namespace.h> +#include <linux/fs_struct.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -139,12 +140,25 @@ static inline const char *get_task_state(struct task_struct *tsk) return task_state_array[fls(state)]; } +static inline int get_task_umask(struct task_struct *tsk) +{ + struct fs_struct *fs; + int umask = -ENOENT; + + task_lock(tsk); + fs = tsk->fs; + if (fs) + umask = fs->umask; + task_unlock(tsk); + return umask; +} + static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; - int g; + int g, umask; struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; @@ -162,6 +176,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, ngid = task_numa_group_id(p); cred = get_task_cred(p); + umask = get_task_umask(p); + if (umask >= 0) + seq_printf(m, "Umask:\t%#04o\n", umask); + task_lock(p); if (p->files) max_fds = files_fdtable(p->files)->max_fds; diff --git a/fs/proc/base.c b/fs/proc/base.c index b1755b23893e..a11eb7196ec8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -434,7 +434,7 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); else - seq_puts(m, "0\n"); + seq_putc(m, '0'); return 0; } @@ -955,7 +955,8 @@ static ssize_t environ_read(struct file *file, char __user *buf, struct mm_struct *mm = file->private_data; unsigned long env_start, env_end; - if (!mm) + /* Ensure the process spawned far enough to have an environment. */ + if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_TEMPORARY); @@ -1819,12 +1820,17 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, child = d_hash_and_lookup(dir, &qname); if (!child) { - child = d_alloc(dir, &qname); - if (!child) - goto end_instantiate; - if (instantiate(d_inode(dir), child, task, ptr) < 0) { - dput(child); + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) goto end_instantiate; + if (d_in_lookup(child)) { + int err = instantiate(d_inode(dir), child, task, ptr); + d_lookup_done(child); + if (err < 0) { + dput(child); + goto end_instantiate; + } } } inode = d_inode(child); @@ -2154,8 +2160,8 @@ out: static const struct file_operations proc_map_files_operations = { .read = generic_read_dir, - .iterate = proc_map_files_readdir, - .llseek = default_llseek, + .iterate_shared = proc_map_files_readdir, + .llseek = generic_file_llseek, }; #ifdef CONFIG_CHECKPOINT_RESTORE @@ -2502,8 +2508,8 @@ static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, - .iterate = proc_attr_dir_readdir, - .llseek = default_llseek, + .iterate_shared = proc_attr_dir_readdir, + .llseek = generic_file_llseek, }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, @@ -2910,8 +2916,8 @@ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, - .iterate = proc_tgid_base_readdir, - .llseek = default_llseek, + .iterate_shared = proc_tgid_base_readdir, + .llseek = generic_file_llseek, }; static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -3157,6 +3163,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) } /* + * proc_tid_comm_permission is a special permission function exclusively + * used for the node /proc/<pid>/task/<tid>/comm. + * It bypasses generic permission checks in the case where a task of the same + * task group attempts to access the node. + * The rationale behind this is that glibc and bionic access this node for + * cross thread naming (pthread_set/getname_np(!self)). However, if + * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, + * which locks out the cross thread naming implementation. + * This function makes sure that the node is always accessible for members of + * same thread group. + */ +static int proc_tid_comm_permission(struct inode *inode, int mask) +{ + bool is_same_tgroup; + struct task_struct *task; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + is_same_tgroup = same_thread_group(current, task); + put_task_struct(task); + + if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { + /* This file (/proc/<pid>/task/<tid>/comm) can always be + * read or written by the members of the corresponding + * thread group. + */ + return 0; + } + + return generic_permission(inode, mask); +} + +static const struct inode_operations proc_tid_comm_inode_operations = { + .permission = proc_tid_comm_permission, +}; + +/* * Tasks */ static const struct pid_entry tid_base_stuff[] = { @@ -3174,7 +3218,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif - REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), + NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, + &proc_tid_comm_inode_operations, + &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif @@ -3258,8 +3304,8 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den static const struct file_operations proc_tid_base_operations = { .read = generic_read_dir, - .iterate = proc_tid_base_readdir, - .llseek = default_llseek, + .iterate_shared = proc_tid_base_readdir, + .llseek = generic_file_llseek, }; static const struct inode_operations proc_tid_base_inode_operations = { @@ -3469,6 +3515,6 @@ static const struct inode_operations proc_task_inode_operations = { static const struct file_operations proc_task_operations = { .read = generic_read_dir, - .iterate = proc_task_readdir, - .llseek = default_llseek, + .iterate_shared = proc_task_readdir, + .llseek = generic_file_llseek, }; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 56afa5ef08f2..01df23cc81f6 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -276,8 +276,8 @@ static int proc_readfd(struct file *file, struct dir_context *ctx) const struct file_operations proc_fd_operations = { .read = generic_read_dir, - .iterate = proc_readfd, - .llseek = default_llseek, + .iterate_shared = proc_readfd, + .llseek = generic_file_llseek, }; static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, @@ -361,6 +361,6 @@ const struct inode_operations proc_fdinfo_inode_operations = { const struct file_operations proc_fdinfo_operations = { .read = generic_read_dir, - .iterate = proc_readfdinfo, - .llseek = default_llseek, + .iterate_shared = proc_readfdinfo, + .llseek = generic_file_llseek, }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index ff3ffc76a937..c633476616e0 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -318,7 +318,7 @@ int proc_readdir(struct file *file, struct dir_context *ctx) static const struct file_operations proc_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = proc_readdir, + .iterate_shared = proc_readdir, }; /* diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 72cb26f85d58..51b8b0a8ad91 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -139,7 +139,8 @@ out: const struct file_operations proc_ns_dir_operations = { .read = generic_read_dir, - .iterate = proc_ns_dir_readdir, + .iterate_shared = proc_ns_dir_readdir, + .llseek = generic_file_llseek, }; static struct dentry *proc_ns_dir_lookup(struct inode *dir, diff --git a/fs/proc/page.c b/fs/proc/page.c index 712f1b9992cc..3ecd445e830d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page) /* - * Caveats on high order pages: page->_count will only be set + * Caveats on high order pages: page->_refcount will only be set * -1 on the head page; SLUB/SLQB do the same for PG_slab; * SLOB won't set PG_slab at all on compound pages. */ diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 350984a19c83..c8bbc68cdb05 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -179,7 +179,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) const struct file_operations proc_net_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = proc_tgid_net_readdir, + .iterate_shared = proc_tgid_net_readdir, }; static __net_init int proc_net_ns_init(struct net *net) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fe5b6e6c4671..5e57c3e46e1d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -627,18 +627,19 @@ static bool proc_sys_fill_cache(struct file *file, child = d_lookup(dir, &qname); if (!child) { - child = d_alloc(dir, &qname); - if (child) { + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + return false; + if (d_in_lookup(child)) { inode = proc_sys_make_inode(dir->d_sb, head, table); if (!inode) { + d_lookup_done(child); dput(child); return false; - } else { - d_set_d_op(child, &proc_sys_dentry_operations); - d_add(child, inode); } - } else { - return false; + d_set_d_op(child, &proc_sys_dentry_operations); + d_add(child, inode); } } inode = d_inode(child); @@ -789,7 +790,7 @@ static const struct file_operations proc_sys_file_operations = { static const struct file_operations proc_sys_dir_file_operations = { .read = generic_read_dir, - .iterate = proc_sys_readdir, + .iterate_shared = proc_sys_readdir, .llseek = generic_file_llseek, }; diff --git a/fs/proc/root.c b/fs/proc/root.c index 361ab4ee42fc..55bc7d6c8aac 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -226,8 +226,8 @@ static int proc_root_readdir(struct file *file, struct dir_context *ctx) */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, - .iterate = proc_root_readdir, - .llseek = default_llseek, + .iterate_shared = proc_root_readdir, + .llseek = generic_file_llseek, }; /* diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9df431642042..4648c7f63ae2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -553,7 +553,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (radix_tree_exceptional_entry(page)) mss->swap += PAGE_SIZE; else - page_cache_release(page); + put_page(page); return; } @@ -1027,11 +1027,15 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, }; if (type == CLEAR_REFS_MM_HIWATER_RSS) { + if (down_write_killable(&mm->mmap_sem)) { + count = -EINTR; + goto out_mm; + } + /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ - down_write(&mm->mmap_sem); reset_mm_hiwater_rss(mm); up_write(&mm->mmap_sem); goto out_mm; @@ -1043,7 +1047,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) { + count = -EINTR; + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); @@ -1518,6 +1525,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return page; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static struct page *can_gather_numa_stats_pmd(pmd_t pmd, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pmd_present(pmd)) + return NULL; + + page = vm_normal_page_pmd(vma, addr, pmd); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_MEMORY])) + return NULL; + + return page; +} +#endif + static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -1527,14 +1560,14 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - pte_t huge_pte = *(pte_t *)pmd; struct page *page; - page = can_gather_numa_stats(huge_pte, vma, addr); + page = can_gather_numa_stats_pmd(*pmd, vma, addr); if (page) - gather_stats(page, md, pte_dirty(huge_pte), + gather_stats(page, md, pmd_dirty(*pmd), HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(ptl); return 0; @@ -1542,6 +1575,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; +#endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, vma, addr); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 55bb57e6a30d..8ab782d8b33d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -279,12 +279,12 @@ static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!page) return VM_FAULT_OOM; if (!PageUptodate(page)) { - offset = (loff_t) index << PAGE_CACHE_SHIFT; + offset = (loff_t) index << PAGE_SHIFT; buf = __va((page_to_pfn(page) << PAGE_SHIFT)); rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); if (rc < 0) { unlock_page(page); - page_cache_release(page); + put_page(page); return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; } SetPageUptodate(page); @@ -1071,7 +1071,7 @@ static int __init parse_crash_elf32_headers(void) /* Do some basic Verification. */ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || (ehdr.e_type != ET_CORE) || - !elf_check_arch(&ehdr) || + !vmcore_elf32_check_arch(&ehdr) || ehdr.e_ident[EI_CLASS] != ELFCLASS32|| ehdr.e_ident[EI_VERSION] != EV_CURRENT || ehdr.e_version != EV_CURRENT || diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index dc645b66cd79..45d6110744cb 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -420,8 +420,8 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) pstore_sb = sb; sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = PSTOREFS_MAGIC; sb->s_op = &pstore_ops; sb->s_time_gran = 1; diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index b218f965817b..781056a0480f 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -71,7 +71,7 @@ const struct file_operations qnx4_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = qnx4_readdir, + .iterate_shared = qnx4_readdir, .fsync = generic_file_fsync, }; diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index e1f37278cf97..27637e0bdc9f 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -35,9 +35,9 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n) static unsigned last_entry(struct inode *inode, unsigned long page_nr) { unsigned long last_byte = inode->i_size; - last_byte -= page_nr << PAGE_CACHE_SHIFT; - if (last_byte > PAGE_CACHE_SIZE) - last_byte = PAGE_CACHE_SIZE; + last_byte -= page_nr << PAGE_SHIFT; + if (last_byte > PAGE_SIZE) + last_byte = PAGE_SIZE; return last_byte / QNX6_DIR_ENTRY_SIZE; } @@ -47,9 +47,9 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, { struct qnx6_sb_info *sbi = QNX6_SB(sb); u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */ - u32 n = s >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* in pages */ + u32 n = s >> (PAGE_SHIFT - sb->s_blocksize_bits); /* in pages */ /* within page */ - u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_CACHE_MASK; + u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_MASK; struct address_space *mapping = sbi->longfile->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); if (IS_ERR(page)) @@ -115,8 +115,8 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) struct qnx6_sb_info *sbi = QNX6_SB(s); loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1); unsigned long npages = dir_pages(inode); - unsigned long n = pos >> PAGE_CACHE_SHIFT; - unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; + unsigned long n = pos >> PAGE_SHIFT; + unsigned start = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE; bool done = false; ctx->pos = pos; @@ -131,7 +131,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) if (IS_ERR(page)) { pr_err("%s(): read failed\n", __func__); - ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; + ctx->pos = (n + 1) << PAGE_SHIFT; return PTR_ERR(page); } de = ((struct qnx6_dir_entry *)page_address(page)) + start; @@ -272,7 +272,7 @@ found: const struct file_operations qnx6_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = qnx6_readdir, + .iterate_shared = qnx6_readdir, .fsync = generic_file_fsync, }; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 47bb1de07155..1192422a1c56 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -542,8 +542,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) iget_failed(inode); return ERR_PTR(-EIO); } - n = (ino - 1) >> (PAGE_CACHE_SHIFT - QNX6_INODE_SIZE_BITS); - offs = (ino - 1) & (~PAGE_CACHE_MASK >> QNX6_INODE_SIZE_BITS); + n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS); + offs = (ino - 1) & (~PAGE_MASK >> QNX6_INODE_SIZE_BITS); mapping = sbi->inodes->i_mapping; page = read_mapping_page(mapping, n, NULL); if (IS_ERR(page)) { diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index d3fb2b698800..f23b5c4a66ad 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -128,7 +128,7 @@ extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, static inline void qnx6_put_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ba827daea5a0..ff21980d0119 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2047,11 +2047,20 @@ int dquot_get_next_id(struct super_block *sb, struct kqid *qid) struct quota_info *dqopt = sb_dqopt(sb); int err; - if (!dqopt->ops[qid->type]->get_next_id) - return -ENOSYS; + mutex_lock(&dqopt->dqonoff_mutex); + if (!sb_has_quota_active(sb, qid->type)) { + err = -ESRCH; + goto out; + } + if (!dqopt->ops[qid->type]->get_next_id) { + err = -ENOSYS; + goto out; + } mutex_lock(&dqopt->dqio_mutex); err = dqopt->ops[qid->type]->get_next_id(sb, qid); mutex_unlock(&dqopt->dqio_mutex); +out: + mutex_unlock(&dqopt->dqonoff_mutex); return err; } diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index d07a2f91d858..8b252673d454 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -47,7 +47,7 @@ void quota_send_warning(struct kqid qid, dev_t dev, void *msg_head; int ret; int msg_size = 4 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u64)); + 2 * nla_total_size_64bit(sizeof(u64)); /* We have to allocate using GFP_NOFS as we are called from a * filesystem performing write and thus further recursion into @@ -68,8 +68,9 @@ void quota_send_warning(struct kqid qid, dev_t dev, ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type); if (ret) goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, - from_kqid_munged(&init_user_ns, qid)); + ret = nla_put_u64_64bit(skb, QUOTA_NL_A_EXCESS_ID, + from_kqid_munged(&init_user_ns, qid), + QUOTA_NL_A_PAD); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); @@ -81,8 +82,9 @@ void quota_send_warning(struct kqid qid, dev_t dev, ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); if (ret) goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, - from_kuid_munged(&init_user_ns, current_uid())); + ret = nla_put_u64_64bit(skb, QUOTA_NL_A_CAUSED_ID, + from_kuid_munged(&init_user_ns, current_uid()), + QUOTA_NL_A_PAD); if (ret) goto attr_err_out; genlmsg_end(skb, msg_head); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index a586467f6ff6..be3ddd189cd4 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -211,14 +211,11 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, struct page **pages = NULL, **ptr, *page; loff_t isize; - if (!(flags & MAP_SHARED)) - return addr; - /* the mapping mustn't extend beyond the EOF */ lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; isize = i_size_read(inode); - ret = -EINVAL; + ret = -ENOSYS; maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= maxpages) goto out; @@ -227,7 +224,6 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, goto out; /* gang-find the pages */ - ret = -ENOMEM; pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto out_free; @@ -263,7 +259,7 @@ out: */ static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) return -ENOSYS; file_accessed(file); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 38981b037524..1ab6e6c2e60e 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -223,8 +223,8 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) return err; sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = RAMFS_MAGIC; sb->s_op = &ramfs_ops; sb->s_time_gran = 1; diff --git a/fs/read_write.c b/fs/read_write.c index cf377cf9dfe3..933b53a375b4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -302,18 +302,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(vfs_llseek); -static inline struct fd fdget_pos(int fd) -{ - return __to_fd(__fdget_pos(fd)); -} - -static inline void fdput_pos(struct fd f) -{ - if (f.flags & FDPUT_POS_UNLOCK) - mutex_unlock(&f.file->f_pos_lock); - fdput(f); -} - SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; @@ -410,11 +398,6 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) } EXPORT_SYMBOL(vfs_iter_write); -/* - * rw_verify_area doesn't like huge counts. We limit - * them to something that fits in "int" so that others - * won't have to do range checks all the time. - */ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { struct inode *inode; @@ -441,11 +424,8 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t if (retval < 0) return retval; } - retval = security_file_permission(file, + return security_file_permission(file, read_write == READ ? MAY_READ : MAY_WRITE); - if (retval) - return retval; - return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; } static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) @@ -489,8 +469,9 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); - if (ret >= 0) { - count = ret; + if (!ret) { + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; ret = __vfs_read(file, buf, count, pos); if (ret > 0) { fsnotify_access(file); @@ -572,8 +553,9 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); - if (ret >= 0) { - count = ret; + if (!ret) { + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; file_start_write(file); ret = __vfs_write(file, buf, count, pos); if (ret > 0) { @@ -698,12 +680,16 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, struct kiocb kiocb; ssize_t ret; - if (flags & ~RWF_HIPRI) + if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)) return -EOPNOTSUPP; init_sync_kiocb(&kiocb, filp); if (flags & RWF_HIPRI) kiocb.ki_flags |= IOCB_HIPRI; + if (flags & RWF_DSYNC) + kiocb.ki_flags |= IOCB_DSYNC; + if (flags & RWF_SYNC) + kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC); kiocb.ki_pos = *ppos; ret = fn(&kiocb, iter); @@ -1323,7 +1309,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; - count = retval; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. @@ -1341,7 +1328,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; - count = retval; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); @@ -1485,11 +1471,12 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (flags != 0) return -EINVAL; - /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */ ret = rw_verify_area(READ, file_in, &pos_in, len); - if (ret >= 0) - ret = rw_verify_area(WRITE, file_out, &pos_out, len); - if (ret < 0) + if (unlikely(ret)) + return ret; + + ret = rw_verify_area(WRITE, file_out, &pos_out, len); + if (unlikely(ret)) return ret; if (!(file_in->f_mode & FMODE_READ) || diff --git a/fs/readdir.c b/fs/readdir.c index e69ef3b79787..68ef06efe6bc 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -24,27 +24,40 @@ int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); + bool shared = false; int res = -ENOTDIR; - if (!file->f_op->iterate) + if (file->f_op->iterate_shared) + shared = true; + else if (!file->f_op->iterate) goto out; res = security_file_permission(file, MAY_READ); if (res) goto out; - res = mutex_lock_killable(&inode->i_mutex); - if (res) - goto out; + if (shared) + inode_lock_shared(inode); + else + inode_lock(inode); + // res = mutex_lock_killable(&inode->i_mutex); + // if (res) + // goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { ctx->pos = file->f_pos; - res = file->f_op->iterate(file, ctx); + if (shared) + res = file->f_op->iterate_shared(file, ctx); + else + res = file->f_op->iterate(file, ctx); file->f_pos = ctx->pos; fsnotify_access(file); file_accessed(file); } - inode_unlock(inode); + if (shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); out: return res; } @@ -111,7 +124,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); struct readdir_callback buf = { .ctx.actor = fillonedir, .dirent = dirent @@ -124,7 +137,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, if (buf.result) error = buf.result; - fdput(f); + fdput_pos(f); return error; } @@ -169,6 +182,8 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, } dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user(offset, &dirent->d_off)) goto efault; } @@ -208,7 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, if (!access_ok(VERIFY_WRITE, dirent, count)) return -EFAULT; - f = fdget(fd); + f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -222,7 +237,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, else error = count - buf.count; } - fdput(f); + fdput_pos(f); return error; } @@ -248,6 +263,8 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return -EINVAL; dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user(offset, &dirent->d_off)) goto efault; } @@ -289,7 +306,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, if (!access_ok(VERIFY_WRITE, dirent, count)) return -EFAULT; - f = fdget(fd); + f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -304,6 +321,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, else error = count - buf.count; } - fdput(f); + fdput_pos(f); return error; } diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 3abd4004184b..45aa05e2232f 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -20,7 +20,7 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, const struct file_operations reiserfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = reiserfs_readdir, + .iterate_shared = reiserfs_readdir, .fsync = reiserfs_dir_fsync, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 9424a4ba93a9..90f815bdfa8a 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -180,11 +180,11 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, int partial = 0; unsigned blocksize; struct buffer_head *bh, *head; - unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; + unsigned long i_size_index = inode->i_size >> PAGE_SHIFT; int new; int logit = reiserfs_file_data_log(inode); struct super_block *s = inode->i_sb; - int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + int bh_per_page = PAGE_SIZE / s->s_blocksize; struct reiserfs_transaction_handle th; int ret = 0; @@ -260,10 +260,10 @@ const struct file_operations reiserfs_file_operations = { const struct inode_operations reiserfs_file_inode_operations = { .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, + .removexattr = generic_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ae9e5b308cf9..825455d3e4ba 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -386,7 +386,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, goto finished; } /* read file tail into part of page */ - offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1); copy_item_head(&tmp_ih, ih); /* @@ -587,10 +587,10 @@ static int convert_tail_for_hole(struct inode *inode, return -EIO; /* always try to read until the end of the block */ - tail_start = tail_offset & (PAGE_CACHE_SIZE - 1); + tail_start = tail_offset & (PAGE_SIZE - 1); tail_end = (tail_start | (bh_result->b_size - 1)) + 1; - index = tail_offset >> PAGE_CACHE_SHIFT; + index = tail_offset >> PAGE_SHIFT; /* * hole_page can be zero in case of direct_io, we are sure * that we cannot get here if we write with O_DIRECT into tail page @@ -629,7 +629,7 @@ static int convert_tail_for_hole(struct inode *inode, unlock: if (tail_page != hole_page) { unlock_page(tail_page); - page_cache_release(tail_page); + put_page(tail_page); } out: return retval; @@ -2189,11 +2189,11 @@ static int grab_tail_page(struct inode *inode, * we want the page with the last byte in the file, * not the page that will hold the next byte for appending */ - unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT; unsigned long pos = 0; unsigned long start = 0; unsigned long blocksize = inode->i_sb->s_blocksize; - unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1); + unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1); struct buffer_head *bh; struct buffer_head *head; struct page *page; @@ -2251,7 +2251,7 @@ out: unlock: unlock_page(page); - page_cache_release(page); + put_page(page); return error; } @@ -2265,7 +2265,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) { struct reiserfs_transaction_handle th; /* we want the offset for the first byte after the end of the file */ - unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + unsigned long offset = inode->i_size & (PAGE_SIZE - 1); unsigned blocksize = inode->i_sb->s_blocksize; unsigned length; struct page *page = NULL; @@ -2345,7 +2345,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) } } unlock_page(page); - page_cache_release(page); + put_page(page); } reiserfs_write_unlock(inode->i_sb); @@ -2354,7 +2354,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) out: if (page) { unlock_page(page); - page_cache_release(page); + put_page(page); } reiserfs_write_unlock(inode->i_sb); @@ -2426,7 +2426,7 @@ research: } else if (is_direct_le_ih(ih)) { char *p; p = page_address(bh_result->b_page); - p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1); + p += (byte_offset - 1) & (PAGE_SIZE - 1); copy_size = ih_item_len(ih) - pos_in_item; fs_gen = get_generation(inode->i_sb); @@ -2525,7 +2525,7 @@ static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + unsigned long end_index = inode->i_size >> PAGE_SHIFT; int error = 0; unsigned long block; sector_t last_block; @@ -2535,7 +2535,7 @@ static int reiserfs_write_full_page(struct page *page, int checked = PageChecked(page); struct reiserfs_transaction_handle th; struct super_block *s = inode->i_sb; - int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + int bh_per_page = PAGE_SIZE / s->s_blocksize; th.t_trans_id = 0; /* no logging allowed when nonblocking or from PF_MEMALLOC */ @@ -2564,16 +2564,16 @@ static int reiserfs_write_full_page(struct page *page, if (page->index >= end_index) { unsigned last_offset; - last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + last_offset = inode->i_size & (PAGE_SIZE - 1); /* no file contents in this page */ if (page->index >= end_index + 1 || !last_offset) { unlock_page(page); return 0; } - zero_user_segment(page, last_offset, PAGE_CACHE_SIZE); + zero_user_segment(page, last_offset, PAGE_SIZE); } bh = head; - block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); + block = page->index << (PAGE_SHIFT - s->s_blocksize_bits); last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; /* first map all the buffers, logging any direct items we find */ do { @@ -2774,7 +2774,7 @@ static int reiserfs_write_begin(struct file *file, *fsdata = (void *)(unsigned long)flags; } - index = pos >> PAGE_CACHE_SHIFT; + index = pos >> PAGE_SHIFT; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -2822,7 +2822,7 @@ static int reiserfs_write_begin(struct file *file, } if (ret) { unlock_page(page); - page_cache_release(page); + put_page(page); /* Truncate allocated blocks */ reiserfs_truncate_failed_write(inode); } @@ -2909,7 +2909,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, else th = NULL; - start = pos & (PAGE_CACHE_SIZE - 1); + start = pos & (PAGE_SIZE - 1); if (unlikely(copied < len)) { if (!PageUptodate(page)) copied = 0; @@ -2974,7 +2974,7 @@ out: if (locked) reiserfs_write_unlock(inode->i_sb); unlock_page(page); - page_cache_release(page); + put_page(page); if (pos + len > inode->i_size) reiserfs_truncate_failed_write(inode); @@ -2996,7 +2996,7 @@ int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to) { struct inode *inode = page->mapping->host; - loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to; + loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to; int ret = 0; int update_sd = 0; struct reiserfs_transaction_handle *th = NULL; @@ -3181,7 +3181,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset, struct inode *inode = page->mapping->host; unsigned int curr_off = 0; unsigned int stop = offset + length; - int partial_page = (offset || length < PAGE_CACHE_SIZE); + int partial_page = (offset || length < PAGE_SIZE); int ret = 1; BUG_ON(!PageLocked(page)); @@ -3279,15 +3279,14 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) * We thank Mingming Cao for helping us understand in great detail what * to do in this section of the code. */ -static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, + ret = blockdev_direct_IO(iocb, inode, iter, reiserfs_get_blocks_direct_io); /* @@ -3296,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + count; + loff_t end = iocb->ki_pos + count; if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { truncate_setsize(inode, isize); diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 036a1fc0a8c3..2f1ddc908013 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -187,7 +187,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) } /* we need to make sure nobody is changing the file size beneath us */ - reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); +{ + int depth = reiserfs_write_unlock_nested(inode->i_sb); + inode_lock(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); +} reiserfs_write_lock(inode->i_sb); @@ -203,7 +207,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) * __reiserfs_write_begin on that page. This will force a * reiserfs_get_block to unpack the tail for us. */ - index = inode->i_size >> PAGE_CACHE_SHIFT; + index = inode->i_size >> PAGE_SHIFT; mapping = inode->i_mapping; page = grab_cache_page(mapping, index); retval = -ENOMEM; @@ -221,7 +225,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) out_unlock: unlock_page(page); - page_cache_release(page); + put_page(page); out: inode_unlock(inode); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 44c2bdced1c8..2ace90e981f0 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -599,18 +599,18 @@ static int journal_list_still_alive(struct super_block *s, * This does a check to see if the buffer belongs to one of these * lost pages before doing the final put_bh. If page->mapping was * null, it tries to free buffers on the page, which should make the - * final page_cache_release drop the page from the lru. + * final put_page drop the page from the lru. */ static void release_buffer_page(struct buffer_head *bh) { struct page *page = bh->b_page; if (!page->mapping && trylock_page(page)) { - page_cache_get(page); + get_page(page); put_bh(bh); if (!page->mapping) try_to_free_buffers(page); unlock_page(page); - page_cache_release(page); + put_page(page); } else { put_bh(bh); } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 2a12d46d7fb4..8a36696d6df9 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1650,10 +1650,10 @@ const struct inode_operations reiserfs_dir_inode_operations = { .mknod = reiserfs_mknod, .rename = reiserfs_rename, .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, + .removexattr = generic_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, @@ -1667,10 +1667,10 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, + .removexattr = generic_removexattr, .permission = reiserfs_permission, }; @@ -1679,10 +1679,10 @@ const struct inode_operations reiserfs_symlink_inode_operations = { */ const struct inode_operations reiserfs_special_inode_operations = { .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, + .removexattr = generic_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 99a5d5dae46a..415d66ca87d1 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -3,8 +3,8 @@ */ #include <linux/string.h> -#include <linux/random.h> #include <linux/time.h> +#include <linux/uuid.h> #include "reiserfs.h" /* find where objectid map starts */ diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 24cbe013240f..5feacd689241 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1342,7 +1342,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, */ data = kmap_atomic(un_bh->b_page); - off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); + off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1)); memcpy(data + off, ih_item_body(PATH_PLAST_BUFFER(path), &s_ih), ret_value); @@ -1511,7 +1511,7 @@ static void unmap_buffers(struct page *page, loff_t pos) if (page) { if (page_has_buffers(page)) { - tail_index = pos & (PAGE_CACHE_SIZE - 1); + tail_index = pos & (PAGE_SIZE - 1); cur_index = 0; head = page_buffers(page); bh = head; diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index f41e19b4bb42..2d5489b0a269 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -151,7 +151,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, */ if (up_to_date_bh) { unsigned pgoff = - (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); + (tail_offset + total_tail - 1) & (PAGE_SIZE - 1); char *kaddr = kmap_atomic(up_to_date_bh->b_page); memset(kaddr + pgoff, 0, blk_size - total_tail); kunmap_atomic(kaddr); @@ -271,7 +271,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, * the page was locked and this part of the page was up to date when * indirect2direct was called, so we know the bytes are still valid */ - tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); + tail = tail + (pos & (PAGE_SIZE - 1)); PATH_LAST_POSITION(path)++; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 57e0b2310532..a33812ae9fad 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -415,7 +415,7 @@ out: static inline void reiserfs_put_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } static struct page *reiserfs_get_page(struct inode *dir, size_t n) @@ -427,7 +427,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n) * and an unlink/rmdir has just occurred - GFP_NOFS avoids this */ mapping_set_gfp_mask(mapping, GFP_NOFS); - page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL); + page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL); if (!IS_ERR(page)) { kmap(page); if (PageError(page)) @@ -526,10 +526,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, while (buffer_pos < buffer_size || buffer_pos == 0) { size_t chunk; size_t skip = 0; - size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + size_t page_offset = (file_pos & (PAGE_SIZE - 1)); - if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) - chunk = PAGE_CACHE_SIZE; + if (buffer_size - buffer_pos > PAGE_SIZE) + chunk = PAGE_SIZE; else chunk = buffer_size - buffer_pos; @@ -546,8 +546,8 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, struct reiserfs_xattr_header *rxh; skip = file_pos = sizeof(struct reiserfs_xattr_header); - if (chunk + skip > PAGE_CACHE_SIZE) - chunk = PAGE_CACHE_SIZE - skip; + if (chunk + skip > PAGE_SIZE) + chunk = PAGE_SIZE - skip; rxh = (struct reiserfs_xattr_header *)data; rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC); rxh->h_hash = cpu_to_le32(xahash); @@ -675,8 +675,8 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, char *data; size_t skip = 0; - if (isize - file_pos > PAGE_CACHE_SIZE) - chunk = PAGE_CACHE_SIZE; + if (isize - file_pos > PAGE_SIZE) + chunk = PAGE_SIZE; else chunk = isize - file_pos; @@ -764,60 +764,6 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers, return xah; } - -/* - * Inode operation getxattr() - */ -ssize_t -reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, - size_t size) -{ - const struct xattr_handler *handler; - - handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - - if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) - return -EOPNOTSUPP; - - return handler->get(handler, dentry, name, buffer, size); -} - -/* - * Inode operation setxattr() - * - * d_inode(dentry)->i_mutex down - */ -int -reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - const struct xattr_handler *handler; - - handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - - if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) - return -EOPNOTSUPP; - - return handler->set(handler, dentry, name, value, size, flags); -} - -/* - * Inode operation removexattr() - * - * d_inode(dentry)->i_mutex down - */ -int reiserfs_removexattr(struct dentry *dentry, const char *name) -{ - const struct xattr_handler *handler; - - handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - - if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) - return -EOPNOTSUPP; - - return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); -} - struct listxattr_buf { struct dir_context ctx; size_t size; diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index 15dde6262c00..613ff5aef94e 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -2,6 +2,7 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/rwsem.h> +#include <linux/xattr.h> struct inode; struct dentry; @@ -18,12 +19,7 @@ int reiserfs_permission(struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) -ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size); -int reiserfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size); -int reiserfs_removexattr(struct dentry *dentry, const char *name); int reiserfs_xattr_get(struct inode *, const char *, void *, size_t); int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int); @@ -92,10 +88,7 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode) #else -#define reiserfs_getxattr NULL -#define reiserfs_setxattr NULL #define reiserfs_listxattr NULL -#define reiserfs_removexattr NULL static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 558a16beaacb..dbed42f755e0 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -197,10 +197,8 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) size = reiserfs_xattr_get(inode, name, NULL, 0); if (size < 0) { - if (size == -ENODATA || size == -ENOSYS) { - set_cached_acl(inode, type, NULL); + if (size == -ENODATA || size == -ENOSYS) return NULL; - } return ERR_PTR(size); } @@ -220,8 +218,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) } else { acl = reiserfs_posix_acl_from_disk(value, retval); } - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); kfree(value); return acl; @@ -370,7 +366,7 @@ int reiserfs_cache_default_acl(struct inode *inode) if (IS_PRIVATE(inode)) return 0; - acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_acl(inode, ACL_TYPE_DEFAULT); if (acl && !IS_ERR(acl)) { int size = reiserfs_acl_size(acl->a_count); diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index ab0217d32039..86aeb9dd805a 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -9,29 +9,26 @@ #include <linux/uaccess.h> static int -security_get(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, void *buffer, size_t size) +security_get(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, void *buffer, size_t size) { - if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) - return -EINVAL; - - if (IS_PRIVATE(d_inode(dentry))) + if (IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); + return reiserfs_xattr_get(inode, xattr_full_name(handler, name), + buffer, size); } static int security_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) - return -EINVAL; - if (IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), + xattr_full_name(handler, name), + buffer, size, flags); } static bool security_list(struct dentry *dentry) diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 64b67aa643a9..31837f031f59 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -8,29 +8,26 @@ #include <linux/uaccess.h> static int -trusted_get(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, void *buffer, size_t size) +trusted_get(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, void *buffer, size_t size) { - if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); + return reiserfs_xattr_get(inode, xattr_full_name(handler, name), + buffer, size); } static int trusted_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), + xattr_full_name(handler, name), + buffer, size, flags); } static bool trusted_list(struct dentry *dentry) diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 12e6306f562a..f7c39731684b 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -7,27 +7,24 @@ #include <linux/uaccess.h> static int -user_get(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, void *buffer, size_t size) +user_get(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, void *buffer, size_t size) { - - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) - return -EINVAL; - if (!reiserfs_xattrs_user(dentry->d_sb)) + if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); + return reiserfs_xattr_get(inode, xattr_full_name(handler, name), + buffer, size); } static int user_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) - return -EINVAL; - if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), + xattr_full_name(handler, name), + buffer, size, flags); } static bool user_list(struct dentry *dentry) diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 6b00ca357c58..d0f8a38dfafa 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -280,8 +280,8 @@ error: static const struct file_operations romfs_dir_operations = { .read = generic_read_dir, - .iterate = romfs_readdir, - .llseek = default_llseek, + .iterate_shared = romfs_readdir, + .llseek = generic_file_llseek, }; static const struct inode_operations romfs_dir_inode_operations = { diff --git a/fs/select.c b/fs/select.c index 869293988c2a..8ed9da50896a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -47,7 +47,7 @@ #define MAX_SLACK (100 * NSEC_PER_MSEC) -static long __estimate_accuracy(struct timespec *tv) +static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; @@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -u64 select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; - struct timespec now; + struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. @@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv) if (rt_task(current)) return 0; - ktime_get_ts(&now); - now = timespec_sub(*tv, now); + ktime_get_ts64(&now); + now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < current->timer_slack_ns) return current->timer_slack_ns; @@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout); /** * poll_select_set_timeout - helper function to setup the timeout value - * @to: pointer to timespec variable for the final timeout + * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * @@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout); * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ -int poll_select_set_timeout(struct timespec *to, long sec, long nsec) +int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { - struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; - if (!timespec_valid(&ts)) + if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { - ktime_get_ts(to); - *to = timespec_add_safe(*to, ts); + ktime_get_ts64(to); + *to = timespec64_add_safe(*to, ts); } return 0; } -static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, +static int poll_select_copy_remaining(struct timespec64 *end_time, + void __user *p, int timeval, int ret) { + struct timespec64 rts64; struct timespec rts; struct timeval rtv; @@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts(&rts); - rts = timespec_sub(*end_time, rts); - if (rts.tv_sec < 0) - rts.tv_sec = rts.tv_nsec = 0; + ktime_get_ts64(&rts64); + rts64 = timespec64_sub(*end_time, rts64); + if (rts64.tv_sec < 0) + rts64.tv_sec = rts64.tv_nsec = 0; + + rts = timespec64_to_timespec(rts64); if (timeval) { if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); - rtv.tv_sec = rts.tv_sec; - rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + rtv.tv_sec = rts64.tv_sec; + rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; @@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -int do_select(int n, fd_set_bits *fds, struct timespec *end_time) +int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * I'm trying ERESTARTNOHAND which restart only when you want to. */ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec *end_time) + fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; @@ -622,7 +626,7 @@ out_nofds: SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timeval __user *, tvp) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; struct timeval tv; int ret; @@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 ts64, end_time, *to = NULL; int ret; if (tsp) { if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; + ts64 = timespec_to_timespec64(ts); to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec)) return -EINVAL; } @@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, - struct timespec *end_time) + struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; @@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, sizeof(struct pollfd)) int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec *end_time) + struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len, size; @@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - struct timespec *to = NULL, end_time; + struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { @@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block) SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { @@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 end_time, *to = NULL; int ret; if (tsp) { diff --git a/fs/seq_file.c b/fs/seq_file.c index e85664b7c7d9..19f532e7d35e 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -72,9 +72,10 @@ int seq_open(struct file *file, const struct seq_operations *op) mutex_init(&p->lock); p->op = op; -#ifdef CONFIG_USER_NS - p->user_ns = file->f_cred->user_ns; -#endif + + // No refcounting: the lifetime of 'p' is constrained + // to the lifetime of the file. + p->file = file; /* * Wrappers around seq_open(e.g. swaps_open) need to be diff --git a/fs/splice.c b/fs/splice.c index 9947b5c69664..dd9bf7e410d2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -88,7 +88,7 @@ out_unlock: static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - page_cache_release(buf->page); + put_page(buf->page); buf->flags &= ~PIPE_BUF_FLAG_LRU; } @@ -268,7 +268,7 @@ EXPORT_SYMBOL_GPL(splice_to_pipe); void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { - page_cache_release(spd->pages[i]); + put_page(spd->pages[i]); } /* @@ -328,9 +328,9 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, if (splice_grow_spd(pipe, &spd)) return -ENOMEM; - index = *ppos >> PAGE_CACHE_SHIFT; - loff = *ppos & ~PAGE_CACHE_MASK; - req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + index = *ppos >> PAGE_SHIFT; + loff = *ppos & ~PAGE_MASK; + req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = min(req_pages, spd.nr_pages_max); /* @@ -365,7 +365,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (unlikely(error)) { - page_cache_release(page); + put_page(page); if (error == -EEXIST) continue; break; @@ -385,7 +385,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * Now loop over the map and see if we need to start IO on any * pages, fill in the partial map, etc. */ - index = *ppos >> PAGE_CACHE_SHIFT; + index = *ppos >> PAGE_SHIFT; nr_pages = spd.nr_pages; spd.nr_pages = 0; for (page_nr = 0; page_nr < nr_pages; page_nr++) { @@ -397,7 +397,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, /* * this_len is the max we'll use from this page */ - this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + this_len = min_t(unsigned long, len, PAGE_SIZE - loff); page = spd.pages[page_nr]; if (PageReadahead(page)) @@ -426,7 +426,7 @@ retry_lookup: error = -ENOMEM; break; } - page_cache_release(spd.pages[page_nr]); + put_page(spd.pages[page_nr]); spd.pages[page_nr] = page; } /* @@ -456,7 +456,7 @@ fill_it: * i_size must be checked after PageUptodate. */ isize = i_size_read(mapping->host); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + end_index = (isize - 1) >> PAGE_SHIFT; if (unlikely(!isize || index > end_index)) break; @@ -470,7 +470,7 @@ fill_it: /* * max good bytes in this page */ - plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + plen = ((isize - 1) & ~PAGE_MASK) + 1; if (plen <= loff) break; @@ -494,8 +494,8 @@ fill_it: * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) - page_cache_release(spd.pages[page_nr++]); - in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + put_page(spd.pages[page_nr++]); + in->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); @@ -636,8 +636,8 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, goto shrink_ret; } - offset = *ppos & ~PAGE_CACHE_MASK; - nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_MASK; + nr_pages = (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { struct page *page; @@ -647,7 +647,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, if (!page) goto err; - this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + this_len = min_t(size_t, len, PAGE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; spd.pages[i] = page; @@ -1143,6 +1143,9 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; + if (unlikely(len > MAX_RW_COUNT)) + len = MAX_RW_COUNT; + if (in->f_op->splice_read) splice_read = in->f_op->splice_read; else diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 0cea9b9236d0..2c2618410d51 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -181,11 +181,11 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, in = min(bytes, msblk->devblksize - offset); bytes -= in; while (in) { - if (pg_offset == PAGE_CACHE_SIZE) { + if (pg_offset == PAGE_SIZE) { data = squashfs_next_page(output); pg_offset = 0; } - avail = min_t(int, in, PAGE_CACHE_SIZE - + avail = min_t(int, in, PAGE_SIZE - pg_offset); memcpy(data + pg_offset, bh[k]->b_data + offset, avail); diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 1cb70a0b2168..23813c078cc9 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -30,7 +30,7 @@ * access the metadata and fragment caches. * * To avoid out of memory and fragmentation issues with vmalloc the cache - * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. + * uses sequences of kmalloced PAGE_SIZE buffers. * * It should be noted that the cache is not used for file datablocks, these * are decompressed and cached in the page-cache in the normal way. The @@ -231,7 +231,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) /* * Initialise cache allocating the specified number of entries, each of * size block_size. To avoid vmalloc fragmentation issues each entry - * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers. + * is allocated as a sequence of kmalloced PAGE_SIZE buffers. */ struct squashfs_cache *squashfs_cache_init(char *name, int entries, int block_size) @@ -255,7 +255,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, cache->unused = entries; cache->entries = entries; cache->block_size = block_size; - cache->pages = block_size >> PAGE_CACHE_SHIFT; + cache->pages = block_size >> PAGE_SHIFT; cache->pages = cache->pages ? cache->pages : 1; cache->name = name; cache->num_waiters = 0; @@ -275,7 +275,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, } for (j = 0; j < cache->pages; j++) { - entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + entry->data[j] = kmalloc(PAGE_SIZE, GFP_KERNEL); if (entry->data[j] == NULL) { ERROR("Failed to allocate %s buffer\n", name); goto cleanup; @@ -314,10 +314,10 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, return min(length, entry->length - offset); while (offset < entry->length) { - void *buff = entry->data[offset / PAGE_CACHE_SIZE] - + (offset % PAGE_CACHE_SIZE); + void *buff = entry->data[offset / PAGE_SIZE] + + (offset % PAGE_SIZE); int bytes = min_t(int, entry->length - offset, - PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); + PAGE_SIZE - (offset % PAGE_SIZE)); if (bytes >= remaining) { memcpy(buffer, buff, remaining); @@ -415,7 +415,7 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, */ void *squashfs_read_table(struct super_block *sb, u64 block, int length) { - int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; int i, res; void *table, *buffer, **data; struct squashfs_page_actor *actor; @@ -436,7 +436,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length) goto failed2; } - for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) + for (i = 0; i < pages; i++, buffer += PAGE_SIZE) data[i] = buffer; res = squashfs_read_data(sb, block, length | diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index e9034bf6e5ae..d2bc13636f79 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -102,7 +102,7 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags) * Read decompressor specific options from file system if present */ if (SQUASHFS_COMP_OPTS(flags)) { - buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buffer == NULL) { comp_opts = ERR_PTR(-ENOMEM); goto out; diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index d8c2d747be28..a5845f94a2a1 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -231,6 +231,6 @@ failed_read: const struct file_operations squashfs_dir_ops = { .read = generic_read_dir, - .iterate = squashfs_readdir, - .llseek = default_llseek, + .iterate_shared = squashfs_readdir, + .llseek = generic_file_llseek, }; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e5c9689062ba..13d80947bf9e 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -175,7 +175,7 @@ static long long read_indexes(struct super_block *sb, int n, { int err, i; long long block = 0; - __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + __le32 *blist = kmalloc(PAGE_SIZE, GFP_KERNEL); if (blist == NULL) { ERROR("read_indexes: Failed to allocate block_list\n"); @@ -183,7 +183,7 @@ static long long read_indexes(struct super_block *sb, int n, } while (n) { - int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2); + int blocks = min_t(int, n, PAGE_SIZE >> 2); err = squashfs_read_metadata(sb, blist, start_block, offset, blocks << 2); @@ -377,19 +377,19 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, struct inode *inode = page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; void *pageaddr; - int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int i, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; int start_index = page->index & ~mask, end_index = start_index | mask; /* * Loop copying datablock into pages. As the datablock likely covers - * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly + * many PAGE_SIZE pages (default block size is 128 KiB) explicitly * grab the pages from the page cache, except for the page that we've * been called to fill. */ for (i = start_index; i <= end_index && bytes > 0; i++, - bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + bytes -= PAGE_SIZE, offset += PAGE_SIZE) { struct page *push_page; - int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0; + int avail = buffer ? min_t(int, bytes, PAGE_SIZE) : 0; TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); @@ -404,14 +404,14 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, pageaddr = kmap_atomic(push_page); squashfs_copy_data(pageaddr, buffer, offset, avail); - memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + memset(pageaddr + avail, 0, PAGE_SIZE - avail); kunmap_atomic(pageaddr); flush_dcache_page(push_page); SetPageUptodate(push_page); skip_page: unlock_page(push_page); if (i != page->index) - page_cache_release(push_page); + put_page(push_page); } } @@ -454,7 +454,7 @@ static int squashfs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); + int index = page->index >> (msblk->block_log - PAGE_SHIFT); int file_end = i_size_read(inode) >> msblk->block_log; int res; void *pageaddr; @@ -462,8 +462,8 @@ static int squashfs_readpage(struct file *file, struct page *page) TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", page->index, squashfs_i(inode)->start); - if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) + if (page->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT)) goto out; if (index < file_end || squashfs_i(inode)->fragment_block == @@ -487,7 +487,7 @@ error_out: SetPageError(page); out: pageaddr = kmap_atomic(page); - memset(pageaddr, 0, PAGE_CACHE_SIZE); + memset(pageaddr, 0, PAGE_SIZE); kunmap_atomic(pageaddr); flush_dcache_page(page); if (!PageError(page)) diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 43e7a7eddac0..cb485d8e0e91 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -30,8 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) struct inode *inode = target_page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; - int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; + int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; int start_index = target_page->index & ~mask; int end_index = start_index | mask; int i, n, pages, missing_pages, bytes, res = -ENOMEM; @@ -68,7 +68,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) if (PageUptodate(page[i])) { unlock_page(page[i]); - page_cache_release(page[i]); + put_page(page[i]); page[i] = NULL; missing_pages++; } @@ -96,10 +96,10 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) goto mark_errored; /* Last page may have trailing bytes not filled */ - bytes = res % PAGE_CACHE_SIZE; + bytes = res % PAGE_SIZE; if (bytes) { pageaddr = kmap_atomic(page[pages - 1]); - memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); + memset(pageaddr + bytes, 0, PAGE_SIZE - bytes); kunmap_atomic(pageaddr); } @@ -109,7 +109,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) SetPageUptodate(page[i]); unlock_page(page[i]); if (page[i] != target_page) - page_cache_release(page[i]); + put_page(page[i]); } kfree(actor); @@ -127,7 +127,7 @@ mark_errored: flush_dcache_page(page[i]); SetPageError(page[i]); unlock_page(page[i]); - page_cache_release(page[i]); + put_page(page[i]); } out: @@ -153,21 +153,21 @@ static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, } for (n = 0; n < pages && bytes > 0; n++, - bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { - int avail = min_t(int, bytes, PAGE_CACHE_SIZE); + bytes -= PAGE_SIZE, offset += PAGE_SIZE) { + int avail = min_t(int, bytes, PAGE_SIZE); if (page[n] == NULL) continue; pageaddr = kmap_atomic(page[n]); squashfs_copy_data(pageaddr, buffer, offset, avail); - memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + memset(pageaddr + avail, 0, PAGE_SIZE - avail); kunmap_atomic(pageaddr); flush_dcache_page(page[n]); SetPageUptodate(page[n]); unlock_page(page[n]); if (page[n] != target_page) - page_cache_release(page[n]); + put_page(page[n]); } out: diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index c31e2bc9c081..ff4468bd18b0 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -117,13 +117,13 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, data = squashfs_first_page(output); buff = stream->output; while (data) { - if (bytes <= PAGE_CACHE_SIZE) { + if (bytes <= PAGE_SIZE) { memcpy(data, buff, bytes); break; } - memcpy(data, buff, PAGE_CACHE_SIZE); - buff += PAGE_CACHE_SIZE; - bytes -= PAGE_CACHE_SIZE; + memcpy(data, buff, PAGE_SIZE); + buff += PAGE_SIZE; + bytes -= PAGE_SIZE; data = squashfs_next_page(output); } squashfs_finish_page(output); diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 244b9fbfff7b..934c17e96590 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -102,13 +102,13 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, data = squashfs_first_page(output); buff = stream->output; while (data) { - if (bytes <= PAGE_CACHE_SIZE) { + if (bytes <= PAGE_SIZE) { memcpy(data, buff, bytes); break; } else { - memcpy(data, buff, PAGE_CACHE_SIZE); - buff += PAGE_CACHE_SIZE; - bytes -= PAGE_CACHE_SIZE; + memcpy(data, buff, PAGE_SIZE); + buff += PAGE_SIZE; + bytes -= PAGE_SIZE; data = squashfs_next_page(output); } } diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 5a1c11f56441..9b7b1b6a7892 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -48,7 +48,7 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, if (actor == NULL) return NULL; - actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->length = length ? : pages * PAGE_SIZE; actor->buffer = buffer; actor->pages = pages; actor->next_page = 0; @@ -88,7 +88,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, if (actor == NULL) return NULL; - actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->length = length ? : pages * PAGE_SIZE; actor->page = page; actor->pages = pages; actor->next_page = 0; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 26dd82008b82..98537eab27e2 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -24,7 +24,7 @@ static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, if (actor == NULL) return NULL; - actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->length = length ? : pages * PAGE_SIZE; actor->page = page; actor->pages = pages; actor->next_page = 0; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 5e79bfa4f260..cf01e15a7b16 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -152,7 +152,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) * Check the system page size is not larger than the filesystem * block size (by default 128K). This is currently not supported. */ - if (PAGE_CACHE_SIZE > msblk->block_size) { + if (PAGE_SIZE > msblk->block_size) { ERROR("Page size > filesystem block size (%d). This is " "currently not supported!\n", msblk->block_size); goto failed_mount; diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index dbcc2f54bad4..d688ef42a6a1 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -48,10 +48,10 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page) struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; - int index = page->index << PAGE_CACHE_SHIFT; + int index = page->index << PAGE_SHIFT; u64 block = squashfs_i(inode)->start; int offset = squashfs_i(inode)->offset; - int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE); + int length = min_t(int, i_size_read(inode) - index, PAGE_SIZE); int bytes, copied; void *pageaddr; struct squashfs_cache_entry *entry; @@ -94,7 +94,7 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page) copied = squashfs_copy_data(pageaddr + bytes, entry, offset, length - bytes); if (copied == length - bytes) - memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length); + memset(pageaddr + length, 0, PAGE_SIZE - length); else block = entry->next_index; kunmap_atomic(pageaddr); diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 1e9de96288d8..1548b3784548 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -214,10 +214,12 @@ failed: static int squashfs_xattr_handler_get(const struct xattr_handler *handler, - struct dentry *d, const char *name, + struct dentry *unused, + struct inode *inode, + const char *name, void *buffer, size_t size) { - return squashfs_xattr_get(d_inode(d), handler->flags, name, + return squashfs_xattr_get(inode, handler->flags, name, buffer, size); } diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index c609624e4b8a..6bfaef73d065 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -141,7 +141,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.in_pos = 0; stream->buf.in_size = 0; stream->buf.out_pos = 0; - stream->buf.out_size = PAGE_CACHE_SIZE; + stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); do { @@ -158,7 +158,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out = squashfs_next_page(output); if (stream->buf.out != NULL) { stream->buf.out_pos = 0; - total += PAGE_CACHE_SIZE; + total += PAGE_SIZE; } } diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 8727caba6882..2ec24d128bce 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -69,7 +69,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, int zlib_err, zlib_init = 0, k = 0; z_stream *stream = strm; - stream->avail_out = PAGE_CACHE_SIZE; + stream->avail_out = PAGE_SIZE; stream->next_out = squashfs_first_page(output); stream->avail_in = 0; @@ -85,7 +85,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); if (stream->next_out != NULL) - stream->avail_out = PAGE_CACHE_SIZE; + stream->avail_out = PAGE_SIZE; } if (!zlib_init) { diff --git a/fs/super.c b/fs/super.c index 74914b1bae70..d78b9847e6cb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -285,7 +285,7 @@ static void put_super(struct super_block *sb) * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * - * Drops an active reference to superblock, converting it into a temprory + * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. diff --git a/fs/sync.c b/fs/sync.c index dd5d1711c7ac..2a54c1f22035 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -302,7 +302,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, goto out; if (sizeof(pgoff_t) == 4) { - if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + if (offset >= (0x100000000ULL << PAGE_SHIFT)) { /* * The range starts outside a 32 bit machine's * pagecache addressing capabilities. Let it "succeed" @@ -310,7 +310,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, ret = 0; goto out; } - if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) { /* * Out to EOF */ diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 63c1bcb224ee..2661b77fc8a7 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -23,14 +23,14 @@ static int sysv_readdir(struct file *, struct dir_context *); const struct file_operations sysv_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = sysv_readdir, + .iterate_shared = sysv_readdir, .fsync = generic_file_fsync, }; static inline void dir_put_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) @@ -73,8 +73,8 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) if (pos >= inode->i_size) return 0; - offset = pos & ~PAGE_CACHE_MASK; - n = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_MASK; + n = pos >> PAGE_SHIFT; for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; @@ -85,7 +85,7 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) continue; kaddr = (char *)page_address(page); de = (struct sysv_dir_entry *)(kaddr+offset); - limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE; + limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE; for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { char *name = de->name; @@ -146,7 +146,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ if (!IS_ERR(page)) { kaddr = (char*)page_address(page); de = (struct sysv_dir_entry *) kaddr; - kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE; + kaddr += PAGE_SIZE - SYSV_DIRSIZE; for ( ; (char *) de <= kaddr ; de++) { if (!de->inode) continue; @@ -190,7 +190,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) goto out; kaddr = (char*)page_address(page); de = (struct sysv_dir_entry *)kaddr; - kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE; + kaddr += PAGE_SIZE - SYSV_DIRSIZE; while ((char *)de <= kaddr) { if (!de->inode) goto got_it; @@ -261,7 +261,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir) kmap(page); base = (char*)page_address(page); - memset(base, 0, PAGE_CACHE_SIZE); + memset(base, 0, PAGE_SIZE); de = (struct sysv_dir_entry *) base; de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); @@ -273,7 +273,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir) kunmap(page); err = dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE); fail: - page_cache_release(page); + put_page(page); return err; } @@ -296,7 +296,7 @@ int sysv_empty_dir(struct inode * inode) kaddr = (char *)page_address(page); de = (struct sysv_dir_entry *)kaddr; - kaddr += PAGE_CACHE_SIZE-SYSV_DIRSIZE; + kaddr += PAGE_SIZE-SYSV_DIRSIZE; for ( ;(char *)de <= kaddr; de++) { if (!de->inode) diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 11e83ed0b4bf..90b60c03b588 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -264,11 +264,11 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, out_dir: if (dir_de) { kunmap(dir_page); - page_cache_release(dir_page); + put_page(dir_page); } out_old: kunmap(old_page); - page_cache_release(old_page); + put_page(old_page); out: return err; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 795992a8321e..4b86d3a738e1 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1182,10 +1182,10 @@ const struct inode_operations ubifs_dir_inode_operations = { .rename = ubifs_rename, .setattr = ubifs_setattr, .getattr = ubifs_getattr, - .setxattr = ubifs_setxattr, - .getxattr = ubifs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ubifs_listxattr, - .removexattr = ubifs_removexattr, + .removexattr = generic_removexattr, #ifdef CONFIG_UBIFS_ATIME_SUPPORT .update_time = ubifs_update_time, #endif @@ -1195,7 +1195,7 @@ const struct file_operations ubifs_dir_operations = { .llseek = generic_file_llseek, .release = ubifs_dir_release, .read = generic_read_dir, - .iterate = ubifs_readdir, + .iterate_shared = ubifs_readdir, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 065c88f8e4b8..08316972ff93 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -121,7 +121,7 @@ static int do_readpage(struct page *page) if (block >= beyond) { /* Reading beyond inode */ SetPageChecked(page); - memset(addr, 0, PAGE_CACHE_SIZE); + memset(addr, 0, PAGE_SIZE); goto out; } @@ -223,7 +223,7 @@ static int write_begin_slow(struct address_space *mapping, { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; struct ubifs_budget_req req = { .new_page = 1 }; int uninitialized_var(err), appending = !!(pos + len > inode->i_size); struct page *page; @@ -254,13 +254,13 @@ static int write_begin_slow(struct address_space *mapping, } if (!PageUptodate(page)) { - if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) SetPageChecked(page); else { err = do_readpage(page); if (err) { unlock_page(page); - page_cache_release(page); + put_page(page); ubifs_release_budget(c, &req); return err; } @@ -428,7 +428,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; int uninitialized_var(err), appending = !!(pos + len > inode->i_size); int skipped_read = 0; struct page *page; @@ -446,7 +446,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, if (!PageUptodate(page)) { /* The page is not loaded from the flash */ - if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { + if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) { /* * We change whole page so no need to load it. But we * do not know whether this page exists on the media or @@ -462,7 +462,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, err = do_readpage(page); if (err) { unlock_page(page); - page_cache_release(page); + put_page(page); return err; } } @@ -494,7 +494,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, mutex_unlock(&ui->ui_mutex); } unlock_page(page); - page_cache_release(page); + put_page(page); return write_begin_slow(mapping, pos, len, pagep, flags); } @@ -549,12 +549,12 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", inode->i_ino, pos, page->index, len, copied, inode->i_size); - if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) { + if (unlikely(copied < len && len == PAGE_SIZE)) { /* * VFS copied less data to the page that it intended and * declared in its '->write_begin()' call via the @len * argument. If the page was not up-to-date, and @len was - * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did + * @PAGE_SIZE, the 'ubifs_write_begin()' function did * not load it from the media (for optimization reasons). This * means that part of the page contains garbage. So read the * page now. @@ -593,7 +593,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, out: unlock_page(page); - page_cache_release(page); + put_page(page); return copied; } @@ -621,10 +621,10 @@ static int populate_page(struct ubifs_info *c, struct page *page, addr = zaddr = kmap(page); - end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + end_index = (i_size - 1) >> PAGE_SHIFT; if (!i_size || page->index > end_index) { hole = 1; - memset(addr, 0, PAGE_CACHE_SIZE); + memset(addr, 0, PAGE_SIZE); goto out_hole; } @@ -673,7 +673,7 @@ static int populate_page(struct ubifs_info *c, struct page *page, } if (end_index == page->index) { - int len = i_size & (PAGE_CACHE_SIZE - 1); + int len = i_size & (PAGE_SIZE - 1); if (len && len < read) memset(zaddr + len, 0, read - len); @@ -773,7 +773,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, isize = i_size_read(inode); if (isize == 0) goto out_free; - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + end_index = ((isize - 1) >> PAGE_SHIFT); for (page_idx = 1; page_idx < page_cnt; page_idx++) { pgoff_t page_offset = offset + page_idx; @@ -788,7 +788,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, if (!PageUptodate(page)) err = populate_page(c, page, bu, &n); unlock_page(page); - page_cache_release(page); + put_page(page); if (err) break; } @@ -905,7 +905,7 @@ static int do_writepage(struct page *page, int len) #ifdef UBIFS_DEBUG struct ubifs_inode *ui = ubifs_inode(inode); spin_lock(&ui->ui_lock); - ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT); + ubifs_assert(page->index <= ui->synced_i_size >> PAGE_SHIFT); spin_unlock(&ui->ui_lock); #endif @@ -1001,8 +1001,8 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); loff_t i_size = i_size_read(inode), synced_i_size; - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - int err, len = i_size & (PAGE_CACHE_SIZE - 1); + pgoff_t end_index = i_size >> PAGE_SHIFT; + int err, len = i_size & (PAGE_SIZE - 1); void *kaddr; dbg_gen("ino %lu, pg %lu, pg flags %#lx", @@ -1021,7 +1021,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) /* Is the page fully inside @i_size? */ if (page->index < end_index) { - if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { + if (page->index >= synced_i_size >> PAGE_SHIFT) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; @@ -1034,7 +1034,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) * with this. */ } - return do_writepage(page, PAGE_CACHE_SIZE); + return do_writepage(page, PAGE_SIZE); } /* @@ -1045,7 +1045,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) * writes to that region are not written out to the file." */ kaddr = kmap_atomic(page); - memset(kaddr + len, 0, PAGE_CACHE_SIZE - len); + memset(kaddr + len, 0, PAGE_SIZE - len); flush_dcache_page(page); kunmap_atomic(kaddr); @@ -1138,7 +1138,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, truncate_setsize(inode, new_size); if (offset) { - pgoff_t index = new_size >> PAGE_CACHE_SHIFT; + pgoff_t index = new_size >> PAGE_SHIFT; struct page *page; page = find_lock_page(inode->i_mapping, index); @@ -1157,9 +1157,9 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, clear_page_dirty_for_io(page); if (UBIFS_BLOCKS_PER_PAGE_SHIFT) offset = new_size & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); err = do_writepage(page, offset); - page_cache_release(page); + put_page(page); if (err) goto out_budg; /* @@ -1173,7 +1173,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, * having to read it. */ unlock_page(page); - page_cache_release(page); + put_page(page); } } } @@ -1285,7 +1285,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, struct ubifs_info *c = inode->i_sb->s_fs_info; ubifs_assert(PagePrivate(page)); - if (offset || length < PAGE_CACHE_SIZE) + if (offset || length < PAGE_SIZE) /* Partial page remains dirty */ return; @@ -1597,10 +1597,10 @@ const struct address_space_operations ubifs_file_address_operations = { const struct inode_operations ubifs_file_inode_operations = { .setattr = ubifs_setattr, .getattr = ubifs_getattr, - .setxattr = ubifs_setxattr, - .getxattr = ubifs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ubifs_listxattr, - .removexattr = ubifs_removexattr, + .removexattr = generic_removexattr, #ifdef CONFIG_UBIFS_ATIME_SUPPORT .update_time = ubifs_update_time, #endif @@ -1611,10 +1611,10 @@ const struct inode_operations ubifs_symlink_inode_operations = { .get_link = simple_get_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, - .setxattr = ubifs_setxattr, - .getxattr = ubifs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ubifs_listxattr, - .removexattr = ubifs_removexattr, + .removexattr = generic_removexattr, #ifdef CONFIG_UBIFS_ATIME_SUPPORT .update_time = ubifs_update_time, #endif diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index f4fbc7b6b794..3cbb904a6d7d 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -28,8 +28,8 @@ #include "ubifs.h" #include <linux/slab.h> -#include <linux/random.h> #include <linux/math64.h> +#include <linux/uuid.h> /* * Default journal size in logical eraseblocks as a percent of total diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a233ba913be4..70349954e78b 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2040,6 +2040,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) if (c->max_inode_sz > MAX_LFS_FILESIZE) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; + sb->s_xattr = ubifs_xattr_handlers; mutex_lock(&c->umount_mutex); err = mount_ubifs(c); @@ -2237,12 +2238,12 @@ static int __init ubifs_init(void) BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4); /* - * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to + * We require that PAGE_SIZE is greater-than-or-equal-to * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. */ - if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { + if (PAGE_SIZE < UBIFS_BLOCK_SIZE) { pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes", - current->pid, (unsigned int)PAGE_CACHE_SIZE); + current->pid, (unsigned int)PAGE_SIZE); return -EINVAL; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index c2a57e193a81..ddf9f6b9eee2 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -37,6 +37,7 @@ #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/security.h> +#include <linux/xattr.h> #include "ubifs-media.h" /* Version of this UBIFS implementation */ @@ -46,8 +47,8 @@ #define UBIFS_SUPER_MAGIC 0x24051905 /* Number of UBIFS blocks per VFS page */ -#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE) -#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT) +#define UBIFS_BLOCKS_PER_PAGE (PAGE_SIZE / UBIFS_BLOCK_SIZE) +#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_SHIFT - UBIFS_BLOCK_SHIFT) /* "File system end of life" sequence number watermark */ #define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL @@ -1732,12 +1733,8 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); /* xattr.c */ -int ubifs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); -ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, - size_t size); +extern const struct xattr_handler *ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); -int ubifs_removexattr(struct dentry *dentry, const char *name); int ubifs_init_security(struct inode *dentry, struct inode *inode, const struct qstr *qstr); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index b043e044121d..6c277eb6aef9 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -249,42 +249,6 @@ out_free: return err; } -/** - * check_namespace - check extended attribute name-space. - * @nm: extended attribute name - * - * This function makes sure the extended attribute name belongs to one of the - * supported extended attribute name-spaces. Returns name-space index in case - * of success and a negative error code in case of failure. - */ -static int check_namespace(const struct qstr *nm) -{ - int type; - - if (nm->len > UBIFS_MAX_NLEN) - return -ENAMETOOLONG; - - if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX, - XATTR_TRUSTED_PREFIX_LEN)) { - if (nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0') - return -EINVAL; - type = TRUSTED_XATTR; - } else if (!strncmp(nm->name, XATTR_USER_PREFIX, - XATTR_USER_PREFIX_LEN)) { - if (nm->name[XATTR_USER_PREFIX_LEN] == '\0') - return -EINVAL; - type = USER_XATTR; - } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) { - if (nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0') - return -EINVAL; - type = SECURITY_XATTR; - } else - return -EOPNOTSUPP; - - return type; -} - static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) { struct inode *inode; @@ -302,24 +266,23 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) return ERR_PTR(-EINVAL); } -static int setxattr(struct inode *host, const char *name, const void *value, - size_t size, int flags) +static int __ubifs_setxattr(struct inode *host, const char *name, + const void *value, size_t size, int flags) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; union ubifs_key key; - int err, type; + int err; ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; - type = check_namespace(&nm); - if (type < 0) - return type; + if (nm.len > UBIFS_MAX_NLEN) + return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); if (!xent) @@ -363,19 +326,10 @@ out_free: return err; } -int ubifs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static ssize_t __ubifs_getxattr(struct inode *host, const char *name, + void *buf, size_t size) { - dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", - name, d_inode(dentry)->i_ino, dentry, size); - - return setxattr(d_inode(dentry), name, value, size, flags); -} - -ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, - size_t size) -{ - struct inode *inode, *host = d_inode(dentry); + struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_inode *ui; @@ -383,12 +337,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, union ubifs_key key; int err; - dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, - host->i_ino, dentry, size); - - err = check_namespace(&nm); - if (err < 0) - return err; + if (nm.len > UBIFS_MAX_NLEN) + return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); if (!xent) @@ -460,8 +410,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) lowest_xent_key(c, &key, host->i_ino); while (1) { - int type; - xent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(xent)) { err = PTR_ERR(xent); @@ -471,14 +419,10 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) nm.name = xent->name; nm.len = le16_to_cpu(xent->nlen); - type = check_namespace(&nm); - if (unlikely(type < 0)) { - err = type; - break; - } - /* Show trusted namespace only for "power" users */ - if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) { + if (strncmp(xent->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) || + capable(CAP_SYS_ADMIN)) { memcpy(buffer + written, nm.name, nm.len + 1); written += nm.len + 1; } @@ -538,22 +482,19 @@ out_cancel: return err; } -int ubifs_removexattr(struct dentry *dentry, const char *name) +static int __ubifs_removexattr(struct inode *host, const char *name) { - struct inode *inode, *host = d_inode(dentry); + struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; union ubifs_key key; int err; - dbg_gen("xattr '%s', ino %lu ('%pd')", name, - host->i_ino, dentry); ubifs_assert(inode_is_locked(host)); - err = check_namespace(&nm); - if (err < 0) - return err; + if (nm.len > UBIFS_MAX_NLEN) + return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); if (!xent) @@ -603,7 +544,7 @@ static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); - err = setxattr(inode, name, xattr->value, xattr->value_len, 0); + err = __ubifs_setxattr(inode, name, xattr->value, xattr->value_len, 0); kfree(name); if (err < 0) break; @@ -626,3 +567,53 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, } return err; } + +static int ubifs_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, + inode->i_ino, dentry, size); + + return __ubifs_getxattr(inode, name, buffer, size); +} + +static int ubifs_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = d_inode(dentry); + + dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", + name, inode->i_ino, dentry, size); + + if (value) + return __ubifs_setxattr(inode, name, value, size, flags); + else + return __ubifs_removexattr(inode, name); +} + +const struct xattr_handler ubifs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = ubifs_xattr_get, + .set = ubifs_xattr_set, +}; + +const struct xattr_handler ubifs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = ubifs_xattr_get, + .set = ubifs_xattr_set, +}; + +const struct xattr_handler ubifs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = ubifs_xattr_get, + .set = ubifs_xattr_set, +}; + +const struct xattr_handler *ubifs_xattr_handlers[] = { + &ubifs_user_xattr_handler, + &ubifs_trusted_xattr_handler, + &ubifs_security_xattr_handler, + NULL +}; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index b51b371b874a..4c5593abc553 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -202,7 +202,7 @@ out: const struct file_operations udf_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = udf_readdir, + .iterate_shared = udf_readdir, .unlocked_ioctl = udf_ioctl, .fsync = generic_file_fsync, }; diff --git a/fs/udf/file.c b/fs/udf/file.c index 1af98963d860..632570617327 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -46,7 +46,7 @@ static void __udf_adinicb_readpage(struct page *page) kaddr = kmap(page); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); - memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size); + memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size); flush_dcache_page(page); SetPageUptodate(page); kunmap(page); @@ -87,20 +87,19 @@ static int udf_adinicb_write_begin(struct file *file, { struct page *page; - if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE)) + if (WARN_ON_ONCE(pos >= PAGE_SIZE)) return -EIO; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) return -ENOMEM; *pagep = page; - if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) + if (!PageUptodate(page) && len != PAGE_SIZE) __udf_adinicb_readpage(page); return 0; } -static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* Fallback to buffered I/O. */ return 0; @@ -153,9 +152,7 @@ out: if (retval > 0) { mark_inode_dirty(inode); - err = generic_write_sync(file, iocb->ki_pos - retval, retval); - if (err < 0) - retval = err; + retval = generic_write_sync(iocb, retval); } return retval; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 166d3ed32c39..f323aff740ef 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -214,8 +214,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, return ret; } -static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -223,9 +222,9 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, udf_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block); if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) - udf_write_failed(mapping, offset + count); + udf_write_failed(mapping, iocb->ki_pos + count); return ret; } @@ -287,7 +286,7 @@ int udf_expand_file_adinicb(struct inode *inode) if (!PageUptodate(page)) { kaddr = kmap(page); memset(kaddr + iinfo->i_lenAlloc, 0x00, - PAGE_CACHE_SIZE - iinfo->i_lenAlloc); + PAGE_SIZE - iinfo->i_lenAlloc); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, iinfo->i_lenAlloc); flush_dcache_page(page); @@ -319,7 +318,7 @@ int udf_expand_file_adinicb(struct inode *inode) inode->i_data.a_ops = &udf_adinicb_aops; up_write(&iinfo->i_data_sem); } - page_cache_release(page); + put_page(page); mark_inode_dirty(inode); return err; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index a2ba11eca995..c3e5c9679371 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1250,7 +1250,7 @@ static struct dentry *udf_get_parent(struct dentry *child) brelse(fibh.sbh); tloc = lelb_to_cpu(cfi.icb.extLocation); - inode = udf_iget(d_inode(child)->i_sb, &tloc); + inode = udf_iget(child->d_sb, &tloc); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/udf/super.c b/fs/udf/super.c index fa92fe839fda..5e2c8c814e1b 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -78,6 +78,15 @@ #define VSD_FIRST_SECTOR_OFFSET 32768 #define VSD_MAX_SECTOR_OFFSET 0x800000 +/* + * Maximum number of Terminating Descriptor / Logical Volume Integrity + * Descriptor redirections. The chosen numbers are arbitrary - just that we + * hopefully don't limit any real use of rewritten inode on write-once media + * but avoid looping for too long on corrupted media. + */ +#define UDF_MAX_TD_NESTING 64 +#define UDF_MAX_LVID_NESTING 1000 + enum { UDF_MAX_LINKS = 0xffff }; /* These are the "meat" - everything else is stuffing */ @@ -919,14 +928,14 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) #endif } - ret = udf_CS0toUTF8(outstr, 31, pvoldesc->volIdent, 32); + ret = udf_dstrCS0toUTF8(outstr, 31, pvoldesc->volIdent, 32); if (ret < 0) goto out_bh; strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret); udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); - ret = udf_CS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128); + ret = udf_dstrCS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128); if (ret < 0) goto out_bh; @@ -1541,42 +1550,52 @@ out_bh: } /* - * udf_load_logicalvolint - * + * Find the prevailing Logical Volume Integrity Descriptor. */ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc) { - struct buffer_head *bh = NULL; + struct buffer_head *bh, *final_bh; uint16_t ident; struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDesc *lvid; + int indirections = 0; + + while (++indirections <= UDF_MAX_LVID_NESTING) { + final_bh = NULL; + while (loc.extLength > 0 && + (bh = udf_read_tagged(sb, loc.extLocation, + loc.extLocation, &ident))) { + if (ident != TAG_IDENT_LVID) { + brelse(bh); + break; + } + + brelse(final_bh); + final_bh = bh; - while (loc.extLength > 0 && - (bh = udf_read_tagged(sb, loc.extLocation, - loc.extLocation, &ident)) && - ident == TAG_IDENT_LVID) { - sbi->s_lvid_bh = bh; - lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + loc.extLength -= sb->s_blocksize; + loc.extLocation++; + } - if (lvid->nextIntegrityExt.extLength) - udf_load_logicalvolint(sb, - leea_to_cpu(lvid->nextIntegrityExt)); + if (!final_bh) + return; - if (sbi->s_lvid_bh != bh) - brelse(bh); - loc.extLength -= sb->s_blocksize; - loc.extLocation++; + brelse(sbi->s_lvid_bh); + sbi->s_lvid_bh = final_bh; + + lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data; + if (lvid->nextIntegrityExt.extLength == 0) + return; + + loc = leea_to_cpu(lvid->nextIntegrityExt); } - if (sbi->s_lvid_bh != bh) - brelse(bh); + + udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n", + UDF_MAX_LVID_NESTING); + brelse(sbi->s_lvid_bh); + sbi->s_lvid_bh = NULL; } -/* - * Maximum number of Terminating Descriptor redirections. The chosen number is - * arbitrary - just that we hopefully don't limit any real use of rewritten - * inode on write-once media but avoid looping for too long on corrupted media. - */ -#define UDF_MAX_TD_NESTING 64 /* * Process a main/reserve volume descriptor sequence. diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 1f32c7bd9f57..27b5335730c9 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -3,9 +3,7 @@ #include <linux/mutex.h> #include <linux/bitops.h> - -/* Since UDF 2.01 is ISO 13346 based... */ -#define UDF_SUPER_MAGIC 0x15013346 +#include <linux/magic.h> #define UDF_MAX_READ_VERSION 0x0250 #define UDF_MAX_WRITE_VERSION 0x0201 diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 972b70625614..263829ef1873 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -212,7 +212,7 @@ extern int udf_get_filename(struct super_block *, const uint8_t *, int, uint8_t *, int); extern int udf_put_filename(struct super_block *, const uint8_t *, int, uint8_t *, int); -extern int udf_CS0toUTF8(uint8_t *, int, const uint8_t *, int); +extern int udf_dstrCS0toUTF8(uint8_t *, int, const uint8_t *, int); /* ialloc.c */ extern void udf_free_inode(struct inode *); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 3ff42f4437f3..695389a4fc23 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -335,9 +335,21 @@ try_again: return u_len; } -int udf_CS0toUTF8(uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len) +int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len, + const uint8_t *ocu_i, int i_len) { - return udf_name_from_CS0(utf_o, o_len, ocu_i, i_len, + int s_len = 0; + + if (i_len > 0) { + s_len = ocu_i[i_len - 1]; + if (s_len >= i_len) { + pr_err("incorrect dstring lengths (%d/%d)\n", + s_len, i_len); + return -EINVAL; + } + } + + return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len, udf_uni2char_utf8, 0); } diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index dc5fae601c24..0447b949c7f5 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -237,7 +237,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, sector_t newb, struct page *locked_page) { const unsigned blks_per_page = - 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); + 1 << (PAGE_SHIFT - inode->i_blkbits); const unsigned mask = blks_per_page - 1; struct address_space * const mapping = inode->i_mapping; pgoff_t index, cur_index, last_index; @@ -255,9 +255,9 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, cur_index = locked_page->index; end = count + beg; - last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + last_index = end >> (PAGE_SHIFT - inode->i_blkbits); for (i = beg; i < end; i = (i | mask) + 1) { - index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + index = i >> (PAGE_SHIFT - inode->i_blkbits); if (likely(cur_index != index)) { page = ufs_get_locked_page(mapping, index); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 74f2e80288bf..57dcceda17d6 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -62,7 +62,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) static inline void ufs_put_page(struct page *page) { kunmap(page); - page_cache_release(page); + put_page(page); } ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) @@ -105,19 +105,19 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, } -static void ufs_check_page(struct page *page) +static bool ufs_check_page(struct page *page) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; char *kaddr = page_address(page); unsigned offs, rec_len; - unsigned limit = PAGE_CACHE_SIZE; + unsigned limit = PAGE_SIZE; const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1; struct ufs_dir_entry *p; char *error; - if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_CACHE_MASK; + if ((dir->i_size >> PAGE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_MASK; if (limit & chunk_mask) goto Ebadsize; if (!limit) @@ -143,7 +143,7 @@ static void ufs_check_page(struct page *page) goto Eend; out: SetPageChecked(page); - return; + return true; /* Too bad, we had an error */ @@ -170,7 +170,7 @@ Einumber: bad_entry: ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - " "offset=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, rec_len, ufs_get_de_namlen(sb, p)); goto fail; Eend: @@ -178,10 +178,10 @@ Eend: ufs_error(sb, __func__, "entry in directory #%lu spans the page boundary" "offset=%lu", - dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs); + dir->i_ino, (page->index<<PAGE_SHIFT)+offs); fail: - SetPageChecked(page); SetPageError(page); + return false; } static struct page *ufs_get_page(struct inode *dir, unsigned long n) @@ -190,10 +190,10 @@ static struct page *ufs_get_page(struct inode *dir, unsigned long n) struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { kmap(page); - if (!PageChecked(page)) - ufs_check_page(page); - if (PageError(page)) - goto fail; + if (unlikely(!PageChecked(page))) { + if (PageError(page) || !ufs_check_page(page)) + goto fail; + } } return page; @@ -211,9 +211,9 @@ ufs_last_byte(struct inode *inode, unsigned long page_nr) { unsigned last_byte = inode->i_size; - last_byte -= page_nr << PAGE_CACHE_SHIFT; - if (last_byte > PAGE_CACHE_SIZE) - last_byte = PAGE_CACHE_SIZE; + last_byte -= page_nr << PAGE_SHIFT; + if (last_byte > PAGE_SIZE) + last_byte = PAGE_SIZE; return last_byte; } @@ -341,7 +341,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) kaddr = page_address(page); dir_end = kaddr + ufs_last_byte(dir, n); de = (struct ufs_dir_entry *)kaddr; - kaddr += PAGE_CACHE_SIZE - reclen; + kaddr += PAGE_SIZE - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ @@ -432,8 +432,8 @@ ufs_readdir(struct file *file, struct dir_context *ctx) loff_t pos = ctx->pos; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - unsigned int offset = pos & ~PAGE_CACHE_MASK; - unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned int offset = pos & ~PAGE_MASK; + unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); int need_revalidate = file->f_version != inode->i_version; @@ -454,14 +454,14 @@ ufs_readdir(struct file *file, struct dir_context *ctx) ufs_error(sb, __func__, "bad page in #%lu", inode->i_ino); - ctx->pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_SIZE - offset; return -EIO; } kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); - ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_SHIFT) + offset; } file->f_version = inode->i_version; need_revalidate = 0; @@ -574,7 +574,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) kmap(page); base = (char*)page_address(page); - memset(base, 0, PAGE_CACHE_SIZE); + memset(base, 0, PAGE_SIZE); de = (struct ufs_dir_entry *) base; @@ -594,7 +594,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) err = ufs_commit_chunk(page, 0, chunk_size); fail: - page_cache_release(page); + put_page(page); return err; } @@ -653,7 +653,7 @@ not_empty: const struct file_operations ufs_dir_operations = { .read = generic_read_dir, - .iterate = ufs_readdir, + .iterate_shared = ufs_readdir, .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index d897e169ab9c..9f49431e798d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1051,13 +1051,13 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) lastfrag--; lastpage = ufs_get_locked_page(mapping, lastfrag >> - (PAGE_CACHE_SHIFT - inode->i_blkbits)); + (PAGE_SHIFT - inode->i_blkbits)); if (IS_ERR(lastpage)) { err = -EIO; goto out; } - end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); + end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1); bh = page_buffers(lastpage); for (i = 0; i < end; ++i) bh = bh->b_this_page; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index acf4a3b61b81..a1559f762805 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -305,7 +305,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0); else { kunmap(dir_page); - page_cache_release(dir_page); + put_page(dir_page); } inode_dec_link_count(old_dir); } @@ -315,11 +315,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, out_dir: if (dir_de) { kunmap(dir_page); - page_cache_release(dir_page); + put_page(dir_page); } out_old: kunmap(old_page); - page_cache_release(old_page); + put_page(old_page); out: return err; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 442fd52ebffe..f04ab232d08d 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -132,7 +132,7 @@ static struct dentry *ufs_get_parent(struct dentry *child) ino = ufs_inode_by_name(d_inode(child), &dot_dot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(ufs_iget(d_inode(child)->i_sb, ino)); + return d_obtain_alias(ufs_iget(child->d_sb, ino)); } static const struct export_operations ufs_export_ops = { diff --git a/fs/ufs/util.c b/fs/ufs/util.c index b6c2f94e041e..a409e3e7827a 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -261,14 +261,14 @@ struct page *ufs_get_locked_page(struct address_space *mapping, if (unlikely(page->mapping == NULL)) { /* Truncate got there first */ unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; goto out; } if (!PageUptodate(page) || PageError(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); printk(KERN_ERR "ufs_change_blocknr: " "can not read page: ino %lu, index: %lu\n", diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 954175928240..b7fbf53dbc81 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -283,7 +283,7 @@ extern struct page *ufs_get_locked_page(struct address_space *mapping, static inline void ufs_put_locked_page(struct page *page) { unlock_page(page); - page_cache_release(page); + put_page(page); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 66cdb44616d5..2d97952e341a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -137,7 +137,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } } @@ -434,6 +434,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file) ACCESS_ONCE(ctx->released) = true; + if (!mmget_not_zero(mm)) + goto wakeup; + /* * Flush page faults out of all CPUs. NOTE: all page faults * must be retried without returning VM_FAULT_SIGBUS if @@ -466,7 +469,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } up_write(&mm->mmap_sem); - + mmput(mm); +wakeup: /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on @@ -760,10 +764,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = uffdio_register.range.start; end = start + uffdio_register.range.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -864,6 +870,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); if (!ret) { /* * Now that we scanned all vmas we can already tell @@ -902,10 +909,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, start = uffdio_unregister.start; end = start + uffdio_unregister.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -998,6 +1007,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); out: return ret; } @@ -1067,9 +1077,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) goto out; - - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + if (mmget_not_zero(ctx->mm)) { + ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; if (ret < 0) @@ -1110,8 +1122,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) goto out; - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + if (mmget_not_zero(ctx->mm)) { + ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; if (ret < 0) @@ -1289,12 +1304,12 @@ static struct file *userfaultfd_file_create(int flags) ctx->released = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ - atomic_inc(&ctx->mm->mm_users); + atomic_inc(&ctx->mm->mm_count); file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) { - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } out: diff --git a/fs/xattr.c b/fs/xattr.c index 4861322e28e8..fc81e771488a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -192,7 +192,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, if (!inode->i_op->getxattr) return -EOPNOTSUPP; - error = inode->i_op->getxattr(dentry, name, NULL, 0); + error = inode->i_op->getxattr(dentry, inode, name, NULL, 0); if (error < 0) return error; @@ -203,7 +203,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, memset(value, 0, error + 1); } - error = inode->i_op->getxattr(dentry, name, value, error); + error = inode->i_op->getxattr(dentry, inode, name, value, error); *xattr_value = value; return error; } @@ -236,7 +236,7 @@ vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) } nolsm: if (inode->i_op->getxattr) - error = inode->i_op->getxattr(dentry, name, value, size); + error = inode->i_op->getxattr(dentry, inode, name, value, size); else error = -EOPNOTSUPP; @@ -655,6 +655,7 @@ strcmp_prefix(const char *a, const char *a_prefix) * operations to the correct xattr_handler. */ #define for_each_xattr_handler(handlers, handler) \ + if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) @@ -668,7 +669,7 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name) const struct xattr_handler *handler; if (!*name) - return NULL; + return ERR_PTR(-EINVAL); for_each_xattr_handler(handlers, handler) { const char *n; @@ -691,14 +692,16 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name) * Find the handler for the prefix and dispatch its get() operation. */ ssize_t -generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) +generic_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) { const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->get(handler, dentry, name, buffer, size); + return handler->get(handler, dentry, inode, + name, buffer, size); } /* diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 686ba6fb20dd..339c696bbc01 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) } void * -kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, - xfs_km_flags_t flags) +kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) { - void *new; + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; - new = kmem_alloc(newsize, flags); - if (ptr) { - if (new) - memcpy(new, ptr, - ((oldsize < newsize) ? oldsize : newsize)); - kmem_free(ptr); - } - return new; + do { + ptr = krealloc(old, newsize, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)", + current->comm, current->pid, + newsize, __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); } void * diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index d1c66e465ca5..689f746224e7 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); -extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fa3b948ef9c2..4e126f41a0aa 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -242,37 +242,21 @@ xfs_attr_set( return error; } - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args.total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - - if (rsvd) - args.trans->t_flags |= XFS_TRANS_RESERVE; - - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * args.total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(args.trans, &tres, args.total, 0); - if (error) { - xfs_trans_cancel(args.trans); + error = xfs_trans_alloc(mp, &tres, args.total, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &args.trans); + if (error) return error; - } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); @@ -429,31 +413,15 @@ xfs_attr_remove( return error; /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); - - /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - - if (flags & ATTR_ROOT) - args.trans->t_flags |= XFS_TRANS_RESERVE; - - error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, - XFS_ATTRRM_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(args.trans); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm, + XFS_ATTRRM_SPACE_RES(mp), 0, + (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0, + &args.trans); + if (error) return error; - } xfs_ilock(dp, XFS_ILOCK_EXCL); /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 041b6948aecc..932381caef1b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1121,15 +1121,14 @@ xfs_bmap_add_attrfork( mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); - if (rsvd) - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); - if (error) { - xfs_trans_cancel(tp); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : @@ -3742,11 +3741,11 @@ xfs_bmap_btalloc( args.prod = align; if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); - } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { + } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) { args.prod = 1; args.mod = 0; } else { - args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; + args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog; if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } @@ -6026,13 +6025,10 @@ xfs_bmap_split_extent( xfs_fsblock_t firstfsb; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 974d62e677f4..e5bb9cc3b243 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -257,15 +257,12 @@ xfs_dir2_block_to_sf( * * Convert the inode to local format and copy the data in. */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); + xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size); + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_d.di_size = size; logflags |= XFS_ILOG_DDATA; - memcpy(dp->i_df.if_u1.if_data, dst, size); - dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 11faf7df14c8..bbcc8c7a44b3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -231,6 +231,48 @@ xfs_iformat_fork( return error; } +void +xfs_init_local_fork( + struct xfs_inode *ip, + int whichfork, + const void *data, + int size) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int mem_size = size, real_size = 0; + bool zero_terminate; + + /* + * If we are using the local fork to store a symlink body we need to + * zero-terminate it so that we can pass it back to the VFS directly. + * Overallocate the in-memory fork by one for that and add a zero + * to terminate it below. + */ + zero_terminate = S_ISLNK(VFS_I(ip)->i_mode); + if (zero_terminate) + mem_size++; + + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (mem_size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(mem_size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + + if (size) { + memcpy(ifp->if_u1.if_data, data, size); + if (zero_terminate) + ifp->if_u1.if_data[size] = '\0'; + } + + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ifp->if_flags |= XFS_IFINLINE; +} + /* * The file is in-lined in the on-disk inode. * If it fits into if_inline_data, then copy @@ -248,8 +290,6 @@ xfs_iformat_local( int whichfork, int size) { - xfs_ifork_t *ifp; - int real_size; /* * If the size is unreasonable, then something @@ -265,22 +305,8 @@ xfs_iformat_local( ip->i_mount, dip); return -EFSCORRUPTED; } - ifp = XFS_IFORK_PTR(ip, whichfork); - real_size = 0; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { - real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } - ifp->if_bytes = size; - ifp->if_real_bytes = real_size; - if (size) - memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFINLINE; + + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -516,7 +542,6 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); @@ -660,7 +685,6 @@ xfs_idata_realloc( ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, real_size, - ifp->if_real_bytes, KM_SLEEP | KM_NOFS); } } else { @@ -1376,8 +1400,7 @@ xfs_iext_realloc_direct( if (rnew_size != ifp->if_real_bytes) { ifp->if_u1.if_extents = kmem_realloc(ifp->if_u1.if_extents, - rnew_size, - ifp->if_real_bytes, KM_NOFS); + rnew_size, KM_NOFS); } if (rnew_size > ifp->if_real_bytes) { memset(&ifp->if_u1.if_extents[ifp->if_bytes / @@ -1461,9 +1484,8 @@ xfs_iext_realloc_indirect( if (new_size == 0) { xfs_iext_destroy(ifp); } else { - ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) - kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_NOFS); + ifp->if_u1.if_ext_irec = + kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS); } } @@ -1497,6 +1519,24 @@ xfs_iext_indirect_to_direct( } /* + * Remove all records from the indirection array. + */ +STATIC void +xfs_iext_irec_remove_all( + struct xfs_ifork *ifp) +{ + int nlists; + int i; + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = 0; i < nlists; i++) + kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf); + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; +} + +/* * Free incore file extents. */ void @@ -1504,14 +1544,7 @@ xfs_iext_destroy( xfs_ifork_t *ifp) /* inode fork pointer */ { if (ifp->if_flags & XFS_IFEXTIREC) { - int erp_idx; - int nlists; - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { - xfs_iext_irec_remove(ifp, erp_idx); - } - ifp->if_flags &= ~XFS_IFEXTIREC; + xfs_iext_irec_remove_all(ifp); } else if (ifp->if_real_bytes) { kmem_free(ifp->if_u1.if_extents); } else if (ifp->if_bytes) { diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7d3b1ed6dcbe..f95e072ae646 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -134,6 +134,7 @@ void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); +void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); struct xfs_bmbt_rec_host * xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index d54a8018b079..e8f49c029ff0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -212,6 +212,11 @@ typedef struct xfs_trans_header { #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ /* + * The only type valid for th_type in CIL-enabled file system logs: + */ +#define XFS_TRANS_CHECKPOINT 40 + +/* * Log item types. */ #define XFS_LI_EFI 0x1236 diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 8a53eaa349f4..12ca86778e02 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -838,12 +838,10 @@ xfs_sync_sb( struct xfs_trans *tp; int error; - tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, + XFS_TRANS_NO_WRITECOUNT, &tp); + if (error) return error; - } xfs_log_sb(tp); if (wait) diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 81ac870834da..16002b5ec4eb 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -56,103 +56,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; /* - * Transaction types. Used to distinguish types of buffers. These never reach - * the log. - */ -#define XFS_TRANS_SETATTR_NOT_SIZE 1 -#define XFS_TRANS_SETATTR_SIZE 2 -#define XFS_TRANS_INACTIVE 3 -#define XFS_TRANS_CREATE 4 -#define XFS_TRANS_CREATE_TRUNC 5 -#define XFS_TRANS_TRUNCATE_FILE 6 -#define XFS_TRANS_REMOVE 7 -#define XFS_TRANS_LINK 8 -#define XFS_TRANS_RENAME 9 -#define XFS_TRANS_MKDIR 10 -#define XFS_TRANS_RMDIR 11 -#define XFS_TRANS_SYMLINK 12 -#define XFS_TRANS_SET_DMATTRS 13 -#define XFS_TRANS_GROWFS 14 -#define XFS_TRANS_STRAT_WRITE 15 -#define XFS_TRANS_DIOSTRAT 16 -/* 17 was XFS_TRANS_WRITE_SYNC */ -#define XFS_TRANS_WRITEID 18 -#define XFS_TRANS_ADDAFORK 19 -#define XFS_TRANS_ATTRINVAL 20 -#define XFS_TRANS_ATRUNCATE 21 -#define XFS_TRANS_ATTR_SET 22 -#define XFS_TRANS_ATTR_RM 23 -#define XFS_TRANS_ATTR_FLAG 24 -#define XFS_TRANS_CLEAR_AGI_BUCKET 25 -#define XFS_TRANS_SB_CHANGE 26 -/* - * Dummy entries since we use the transaction type to index into the - * trans_type[] in xlog_recover_print_trans_head() - */ -#define XFS_TRANS_DUMMY1 27 -#define XFS_TRANS_DUMMY2 28 -#define XFS_TRANS_QM_QUOTAOFF 29 -#define XFS_TRANS_QM_DQALLOC 30 -#define XFS_TRANS_QM_SETQLIM 31 -#define XFS_TRANS_QM_DQCLUSTER 32 -#define XFS_TRANS_QM_QINOCREATE 33 -#define XFS_TRANS_QM_QUOTAOFF_END 34 -#define XFS_TRANS_FSYNC_TS 35 -#define XFS_TRANS_GROWFSRT_ALLOC 36 -#define XFS_TRANS_GROWFSRT_ZERO 37 -#define XFS_TRANS_GROWFSRT_FREE 38 -#define XFS_TRANS_SWAPEXT 39 -#define XFS_TRANS_CHECKPOINT 40 -#define XFS_TRANS_ICREATE 41 -#define XFS_TRANS_CREATE_TMPFILE 42 -#define XFS_TRANS_TYPE_MAX 43 -/* new transaction types need to be reflected in xfs_logprint(8) */ - -#define XFS_TRANS_TYPES \ - { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ - { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ - { XFS_TRANS_INACTIVE, "INACTIVE" }, \ - { XFS_TRANS_CREATE, "CREATE" }, \ - { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ - { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ - { XFS_TRANS_REMOVE, "REMOVE" }, \ - { XFS_TRANS_LINK, "LINK" }, \ - { XFS_TRANS_RENAME, "RENAME" }, \ - { XFS_TRANS_MKDIR, "MKDIR" }, \ - { XFS_TRANS_RMDIR, "RMDIR" }, \ - { XFS_TRANS_SYMLINK, "SYMLINK" }, \ - { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ - { XFS_TRANS_GROWFS, "GROWFS" }, \ - { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ - { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ - { XFS_TRANS_WRITEID, "WRITEID" }, \ - { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ - { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ - { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ - { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ - { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ - { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ - { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ - { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \ - { XFS_TRANS_DUMMY1, "DUMMY1" }, \ - { XFS_TRANS_DUMMY2, "DUMMY2" }, \ - { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ - { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ - { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ - { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ - { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ - { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ - { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ - { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ - { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ - { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ - { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ - { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ - { XFS_TRANS_ICREATE, "ICREATE" }, \ - { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ - { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } - -/* * This structure is used to track log items associated with * a transaction. It points to the log item and keeps some * flags to track the state of the log item. It also tracks @@ -181,8 +84,9 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer - count in superblock */ +#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +#define XFS_TRANS_NOFS 0x80 /* pass KM_NOFS to kmem_alloc */ + /* * Field values for xfs_trans_mod_sb. */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 2d5df1f23bbc..b6e527b8eccb 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -158,22 +158,14 @@ xfs_get_acl(struct inode *inode, int type) if (error) { /* * If the attribute doesn't exist make sure we have a negative - * cache entry, for any other error assume it is transient and - * leave the cache entry as ACL_NOT_CACHED. + * cache entry, for any other error assume it is transient. */ - if (error == -ENOATTR) - goto out_update_cache; - acl = ERR_PTR(error); - goto out; + if (error != -ENOATTR) + acl = ERR_PTR(error); + } else { + acl = xfs_acl_from_disk(xfs_acl, len, + XFS_ACL_MAX_ENTRIES(ip->i_mount)); } - - acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount)); - if (IS_ERR(acl)) - goto out; - -out_update_cache: - set_cached_acl(inode, type, acl); -out: kmem_free(xfs_acl); return acl; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index d445a64b979e..4c463b99fe57 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -84,23 +84,71 @@ xfs_find_bdev_for_inode( } /* - * We're now finished for good with this ioend structure. - * Update the page state via the associated buffer_heads, - * release holds on the inode and bio, and finally free - * up memory. Do not use the ioend after this. + * We're now finished for good with this page. Update the page state via the + * associated buffer_heads, paying attention to the start and end offsets that + * we need to process on the page. + */ +static void +xfs_finish_page_writeback( + struct inode *inode, + struct bio_vec *bvec, + int error) +{ + unsigned int end = bvec->bv_offset + bvec->bv_len - 1; + struct buffer_head *head, *bh; + unsigned int off = 0; + + ASSERT(bvec->bv_offset < PAGE_SIZE); + ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); + ASSERT(end < PAGE_SIZE); + ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0); + + bh = head = page_buffers(bvec->bv_page); + + do { + if (off < bvec->bv_offset) + goto next_bh; + if (off > end) + break; + bh->b_end_io(bh, !error); +next_bh: + off += bh->b_size; + } while ((bh = bh->b_this_page) != head); +} + +/* + * We're now finished for good with this ioend structure. Update the page + * state, release holds on bios, and finally free up memory. Do not use the + * ioend after this. */ STATIC void xfs_destroy_ioend( - xfs_ioend_t *ioend) + struct xfs_ioend *ioend, + int error) { - struct buffer_head *bh, *next; + struct inode *inode = ioend->io_inode; + struct bio *last = ioend->io_bio; + struct bio *bio, *next; - for (bh = ioend->io_buffer_head; bh; bh = next) { - next = bh->b_private; - bh->b_end_io(bh, !ioend->io_error); - } + for (bio = &ioend->io_inline_bio; bio; bio = next) { + struct bio_vec *bvec; + int i; + + /* + * For the last bio, bi_private points to the ioend, so we + * need to explicitly end the iteration here. + */ + if (bio == last) + next = NULL; + else + next = bio->bi_private; - mempool_free(ioend, xfs_ioend_pool); + /* walk each page on bio, ending page IO on them */ + bio_for_each_segment_all(bvec, bio, i) + xfs_finish_page_writeback(inode, bvec, error); + + bio_put(bio); + } } /* @@ -120,13 +168,9 @@ xfs_setfilesize_trans_alloc( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) return error; - } ioend->io_append_trans = tp; @@ -174,7 +218,8 @@ xfs_setfilesize( STATIC int xfs_setfilesize_ioend( - struct xfs_ioend *ioend) + struct xfs_ioend *ioend, + int error) { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_trans *tp = ioend->io_append_trans; @@ -188,53 +233,32 @@ xfs_setfilesize_ioend( __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ - if (ioend->io_error) { + if (error) { xfs_trans_cancel(tp); - return ioend->io_error; + return error; } return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } /* - * Schedule IO completion handling on the final put of an ioend. - * - * If there is no work to do we might as well call it a day and free the - * ioend right now. - */ -STATIC void -xfs_finish_ioend( - struct xfs_ioend *ioend) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - - if (ioend->io_type == XFS_IO_UNWRITTEN) - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) - queue_work(mp->m_data_workqueue, &ioend->io_work); - else - xfs_destroy_ioend(ioend); - } -} - -/* * IO write completion. */ STATIC void xfs_end_io( struct work_struct *work) { - xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); - struct xfs_inode *ip = XFS_I(ioend->io_inode); - int error = 0; + struct xfs_ioend *ioend = + container_of(work, struct xfs_ioend, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = ioend->io_bio->bi_error; /* * Set an error if the mount has shut down and proceed with end I/O * processing so it can perform whatever cleanups are necessary. */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - ioend->io_error = -EIO; + error = -EIO; /* * For unwritten extents we need to issue transactions to convert a @@ -244,55 +268,33 @@ xfs_end_io( * on error. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { - if (ioend->io_error) + if (error) goto done; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); } else if (ioend->io_append_trans) { - error = xfs_setfilesize_ioend(ioend); + error = xfs_setfilesize_ioend(ioend, error); } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: - if (error) - ioend->io_error = error; - xfs_destroy_ioend(ioend); + xfs_destroy_ioend(ioend, error); } -/* - * Allocate and initialise an IO completion structure. - * We need to track unwritten extent write completion here initially. - * We'll need to extend this for updating the ondisk inode size later - * (vs. incore size). - */ -STATIC xfs_ioend_t * -xfs_alloc_ioend( - struct inode *inode, - unsigned int type) +STATIC void +xfs_end_bio( + struct bio *bio) { - xfs_ioend_t *ioend; - - ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); - - /* - * Set the count to 1 initially, which will prevent an I/O - * completion callback from happening before we have started - * all the I/O from calling the completion routine too early. - */ - atomic_set(&ioend->io_remaining, 1); - ioend->io_error = 0; - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; - ioend->io_inode = inode; - ioend->io_buffer_head = NULL; - ioend->io_buffer_tail = NULL; - ioend->io_offset = 0; - ioend->io_size = 0; - ioend->io_append_trans = NULL; + struct xfs_ioend *ioend = bio->bi_private; + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - INIT_WORK(&ioend->io_work, xfs_end_io); - return ioend; + if (ioend->io_type == XFS_IO_UNWRITTEN) + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); + else if (ioend->io_append_trans) + queue_work(mp->m_data_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend, bio->bi_error); } STATIC int @@ -364,50 +366,6 @@ xfs_imap_valid( offset < imap->br_startoff + imap->br_blockcount; } -/* - * BIO completion handler for buffered IO. - */ -STATIC void -xfs_end_bio( - struct bio *bio) -{ - xfs_ioend_t *ioend = bio->bi_private; - - if (!ioend->io_error) - ioend->io_error = bio->bi_error; - - /* Toss bio and pass work off to an xfsdatad thread */ - bio->bi_private = NULL; - bio->bi_end_io = NULL; - bio_put(bio); - - xfs_finish_ioend(ioend); -} - -STATIC void -xfs_submit_ioend_bio( - struct writeback_control *wbc, - xfs_ioend_t *ioend, - struct bio *bio) -{ - atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; - bio->bi_end_io = xfs_end_bio; - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); -} - -STATIC struct bio * -xfs_alloc_ioend_bio( - struct buffer_head *bh) -{ - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - - ASSERT(bio->bi_private == NULL); - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - return bio; -} - STATIC void xfs_start_buffer_writeback( struct buffer_head *bh) @@ -452,28 +410,35 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) } /* - * Submit all of the bios for an ioend. We are only passed a single ioend at a - * time; the caller is responsible for chaining prior to submission. + * Submit the bio for an ioend. We are passed an ioend with a bio attached to + * it, and we submit that bio. The ioend may be used for multiple bio + * submissions, so we only want to allocate an append transaction for the ioend + * once. In the case of multiple bio submission, each bio will take an IO + * reference to the ioend to ensure that the ioend completion is only done once + * all bios have been submitted and the ioend is really done. * * If @fail is non-zero, it means that we have a situation where some part of * the submission process has failed after we have marked paged for writeback - * and unlocked them. In this situation, we need to fail the ioend chain rather - * than submit it to IO. This typically only happens on a filesystem shutdown. + * and unlocked them. In this situation, we need to fail the bio and ioend + * rather than submit it to IO. This typically only happens on a filesystem + * shutdown. */ STATIC int xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend, + struct xfs_ioend *ioend, int status) { - struct buffer_head *bh; - struct bio *bio; - sector_t lastblock = 0; - /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + ioend->io_type != XFS_IO_UNWRITTEN && + xfs_ioend_is_append(ioend) && + !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); + + ioend->io_bio->bi_private = ioend; + ioend->io_bio->bi_end_io = xfs_end_bio; + /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -481,33 +446,73 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_error = status; - xfs_finish_ioend(ioend); + ioend->io_bio->bi_error = status; + bio_endio(ioend->io_bio); return status; } - bio = NULL; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, + ioend->io_bio); + return 0; +} - if (!bio) { -retry: - bio = xfs_alloc_ioend_bio(bh); - } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } +static void +xfs_init_bio_from_bh( + struct bio *bio, + struct buffer_head *bh) +{ + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; +} - if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } +static struct xfs_ioend * +xfs_alloc_ioend( + struct inode *inode, + unsigned int type, + xfs_off_t offset, + struct buffer_head *bh) +{ + struct xfs_ioend *ioend; + struct bio *bio; - lastblock = bh->b_blocknr; - } - if (bio) - xfs_submit_ioend_bio(wbc, ioend, bio); - xfs_finish_ioend(ioend); - return 0; + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset); + xfs_init_bio_from_bh(bio, bh); + + ioend = container_of(bio, struct xfs_ioend, io_inline_bio); + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_type = type; + ioend->io_inode = inode; + ioend->io_size = 0; + ioend->io_offset = offset; + INIT_WORK(&ioend->io_work, xfs_end_io); + ioend->io_append_trans = NULL; + ioend->io_bio = bio; + return ioend; +} + +/* + * Allocate a new bio, and chain the old bio to the new one. + * + * Note that we have to do perform the chaining in this unintuitive order + * so that the bi_private linkage is set up in the right direction for the + * traversal in xfs_destroy_ioend(). + */ +static void +xfs_chain_bio( + struct xfs_ioend *ioend, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + struct bio *new; + + new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + xfs_init_bio_from_bh(new, bh); + + bio_chain(ioend->io_bio, new); + bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, + ioend->io_bio); + ioend->io_bio = new; } /* @@ -523,27 +528,24 @@ xfs_add_to_ioend( struct buffer_head *bh, xfs_off_t offset, struct xfs_writepage_ctx *wpc, + struct writeback_control *wbc, struct list_head *iolist) { if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || bh->b_blocknr != wpc->last_block + 1 || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { - struct xfs_ioend *new; - if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - - new = xfs_alloc_ioend(inode, wpc->io_type); - new->io_offset = offset; - new->io_buffer_head = bh; - new->io_buffer_tail = bh; - wpc->ioend = new; - } else { - wpc->ioend->io_buffer_tail->b_private = bh; - wpc->ioend->io_buffer_tail = bh; + wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh); } - bh->b_private = NULL; + /* + * If the buffer doesn't fit into the bio we need to allocate a new + * one. This shouldn't happen more than once for a given buffer. + */ + while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size) + xfs_chain_bio(wpc->ioend, wbc, bh); + wpc->ioend->io_size += bh->b_size; wpc->last_block = bh->b_blocknr; xfs_start_buffer_writeback(bh); @@ -704,7 +706,7 @@ next_buffer: xfs_iunlock(ip, XFS_ILOCK_EXCL); out_invalidate: - xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); + xfs_vm_invalidatepage(page, 0, PAGE_SIZE); return; } @@ -803,7 +805,7 @@ xfs_writepage_map( lock_buffer(bh); if (wpc->io_type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list); + xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list); count++; } @@ -925,9 +927,9 @@ xfs_do_writepage( * ---------------------------------^------------------| */ offset = i_size_read(inode); - end_index = offset >> PAGE_CACHE_SHIFT; + end_index = offset >> PAGE_SHIFT; if (page->index < end_index) - end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; + end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT; else { /* * Check whether the page to write out is beyond or straddles @@ -940,7 +942,7 @@ xfs_do_writepage( * | | Straddles | * ---------------------------------^-----------|--------| */ - unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); + unsigned offset_into_page = offset & (PAGE_SIZE - 1); /* * Skip the page if it is fully outside i_size, e.g. due to a @@ -971,7 +973,7 @@ xfs_do_writepage( * memory is zeroed when mapped, and writes to that region are * not written out to the file." */ - zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); + zero_user_segment(page, offset_into_page, PAGE_SIZE); /* Adjust the end_offset to the end of file */ end_offset = offset; @@ -1391,13 +1393,10 @@ xfs_end_io_direct_write( trace_xfs_end_io_direct_write_append(ip, offset, size); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); - return error; - } - error = xfs_setfilesize(ip, tp, offset, size); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, + &tp); + if (!error) + error = xfs_setfilesize(ip, tp, offset, size); } return error; @@ -1406,8 +1405,7 @@ xfs_end_io_direct_write( STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, - struct iov_iter *iter, - loff_t offset) + struct iov_iter *iter) { struct inode *inode = iocb->ki_filp->f_mapping->host; dio_iodone_t *endio = NULL; @@ -1420,12 +1418,12 @@ xfs_vm_direct_IO( } if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, offset, + return dax_do_io(iocb, inode, iter, xfs_get_blocks_direct, endio, 0); } bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + return __blockdev_direct_IO(iocb, inode, bdev, iter, xfs_get_blocks_direct, endio, NULL, flags); } @@ -1475,7 +1473,7 @@ xfs_vm_write_failed( loff_t block_offset; loff_t block_start; loff_t block_end; - loff_t from = pos & (PAGE_CACHE_SIZE - 1); + loff_t from = pos & (PAGE_SIZE - 1); loff_t to = from + len; struct buffer_head *bh, *head; struct xfs_mount *mp = XFS_I(inode)->i_mount; @@ -1491,7 +1489,7 @@ xfs_vm_write_failed( * start of the page by using shifts rather than masks the mismatch * problem. */ - block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; ASSERT(block_offset + from == pos); @@ -1558,12 +1556,12 @@ xfs_vm_write_begin( struct page **pagep, void **fsdata) { - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; struct page *page; int status; struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; - ASSERT(len <= PAGE_CACHE_SIZE); + ASSERT(len <= PAGE_SIZE); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -1592,7 +1590,7 @@ xfs_vm_write_begin( truncate_pagecache_range(inode, start, pos + len); } - page_cache_release(page); + put_page(page); page = NULL; } @@ -1620,7 +1618,7 @@ xfs_vm_write_end( { int ret; - ASSERT(len <= PAGE_CACHE_SIZE); + ASSERT(len <= PAGE_SIZE); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); if (unlikely(ret < len)) { diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index b4421177b68d..814aab790713 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,7 +18,7 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern mempool_t *xfs_ioend_pool; +extern struct bio_set *xfs_ioend_bioset; /* * Types of I/O for bmap clustering and I/O completion tracking. @@ -37,22 +37,19 @@ enum { { XFS_IO_OVERWRITE, "overwrite" } /* - * xfs_ioend struct manages large extent writes for XFS. - * It can manage several multi-page bio's at once. + * Structure for buffered I/O completions. */ -typedef struct xfs_ioend { +struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ unsigned int io_type; /* delalloc / unwritten */ - int io_error; /* I/O error code */ - atomic_t io_remaining; /* hold count */ struct inode *io_inode; /* file being written to */ - struct buffer_head *io_buffer_head;/* buffer linked list head */ - struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. for size update */ -} xfs_ioend_t; + struct bio *io_bio; /* bio being built */ + struct bio io_inline_bio; /* MUST BE LAST! */ +}; extern const struct address_space_operations xfs_address_space_operations; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index dd4824589470..e3da5d448bcf 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,9 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ +/* Return 0 on success, or -errno; other state communicated via *context */ typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, - unsigned char *, int, int, unsigned char *); + unsigned char *, int, int); typedef struct xfs_attr_list_context { struct xfs_inode *dp; /* inode */ @@ -126,7 +127,6 @@ typedef struct xfs_attr_list_context { int firstu; /* first used byte in buffer */ int flags; /* from VOP call */ int resynch; /* T/F: resynch with cursor */ - int put_value; /* T/F: need value for listent */ put_listent_func_t put_listent; /* list output fmt function */ int index; /* index into output buffer */ } xfs_attr_list_context_t; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 2bb959ada45b..55d214981ed2 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -405,21 +405,11 @@ xfs_attr_inactive( goto out_destroy_fork; xfs_iunlock(dp, lock_mode); - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ lock_mode = 0; - trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); - error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans); if (error) - goto out_cancel; + goto out_destroy_fork; lock_mode = XFS_ILOCK_EXCL; xfs_ilock(dp, lock_mode); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 4fa14820e2e2..d25f26b22ac9 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -106,18 +106,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sfe->flags, sfe->nameval, (int)sfe->namelen, - (int)sfe->valuelen, - &sfe->nameval[sfe->namelen]); - + (int)sfe->valuelen); + if (error) + return error; /* * Either search callback finished early or * didn't fit it all in the buffer after all. */ if (context->seen_enough) break; - - if (error) - return error; sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } trace_xfs_attr_list_sf_all(context); @@ -200,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sbp->flags, sbp->name, sbp->namelen, - sbp->valuelen, - &sbp->name[sbp->namelen]); + sbp->valuelen); if (error) { kmem_free(sbuf); return error; @@ -416,6 +412,9 @@ xfs_attr3_leaf_list_int( */ retval = 0; for (; i < ichdr.count; entry++, i++) { + char *name; + int namelen, valuelen; + if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); cursor->offset = 0; @@ -425,56 +424,25 @@ xfs_attr3_leaf_list_int( continue; /* skip incomplete entries */ if (entry->flags & XFS_ATTR_LOCAL) { - xfs_attr_leaf_name_local_t *name_loc = - xfs_attr3_leaf_name_local(leaf, i); - - retval = context->put_listent(context, - entry->flags, - name_loc->nameval, - (int)name_loc->namelen, - be16_to_cpu(name_loc->valuelen), - &name_loc->nameval[name_loc->namelen]); - if (retval) - return retval; + xfs_attr_leaf_name_local_t *name_loc; + + name_loc = xfs_attr3_leaf_name_local(leaf, i); + name = name_loc->nameval; + namelen = name_loc->namelen; + valuelen = be16_to_cpu(name_loc->valuelen); } else { - xfs_attr_leaf_name_remote_t *name_rmt = - xfs_attr3_leaf_name_remote(leaf, i); - - int valuelen = be32_to_cpu(name_rmt->valuelen); - - if (context->put_value) { - xfs_da_args_t args; - - memset((char *)&args, 0, sizeof(args)); - args.geo = context->dp->i_mount->m_attr_geo; - args.dp = context->dp; - args.whichfork = XFS_ATTR_FORK; - args.valuelen = valuelen; - args.rmtvaluelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); - args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = xfs_attr3_rmt_blocks( - args.dp->i_mount, valuelen); - retval = xfs_attr_rmtval_get(&args); - if (!retval) - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); - kmem_free(args.value); - } else { - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - NULL); - } - if (retval) - return retval; + xfs_attr_leaf_name_remote_t *name_rmt; + + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + name = name_rmt->name; + namelen = name_rmt->namelen; + valuelen = be32_to_cpu(name_rmt->valuelen); } + + retval = context->put_listent(context, entry->flags, + name, namelen, valuelen); + if (retval) + break; if (context->seen_enough) break; cursor->offset++; @@ -551,8 +519,7 @@ xfs_attr_put_listent( int flags, unsigned char *name, int namelen, - int valuelen, - unsigned char *value) + int valuelen) { struct attrlist *alist = (struct attrlist *)context->alist; attrlist_ent_t *aep; @@ -581,7 +548,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 1; + return 0; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a32c1dcae2ff..586bb64e674b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -72,18 +72,11 @@ xfs_zero_extent( struct xfs_mount *mp = ip->i_mount; xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); sector_t block = XFS_BB_TO_FSBT(mp, sector); - ssize_t size = XFS_FSB_TO_B(mp, count_fsb); - - if (IS_DAX(VFS_I(ip))) - return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), - sector, size); - - /* - * let the block layer decide on the fastest method of - * implementing the zeroing. - */ - return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS); + return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), + block << (mp->m_super->s_blocksize_bits - 9), + count_fsb << (mp->m_super->s_blocksize_bits - 9), + GFP_NOFS, true); } /* @@ -900,19 +893,15 @@ xfs_free_eofblocks( * Free them up now by truncating the file to * its current size. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) return -EAGAIN; - } } - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); if (need_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -1037,9 +1026,9 @@ xfs_alloc_file_space( /* * Allocate and setup the transaction. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, resrtextents); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + resrtextents, 0, &tp); + /* * Check for running out of space */ @@ -1048,7 +1037,6 @@ xfs_alloc_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1237,7 +1225,7 @@ xfs_free_file_space( /* wait for the completion of any pending DIOs */ inode_dio_wait(VFS_I(ip)); - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); ioffset = round_down(offset, rounding); iendoffset = round_up(offset + len, rounding) - 1; error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, @@ -1311,18 +1299,10 @@ xfs_free_file_space( * transaction to dip into the reserve blocks to ensure * the freeing of the space succeeds at ENOSPC. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); - - /* - * check for running out of space - */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, + &tp); if (error) { - /* - * Free the transaction structure. - */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1466,7 +1446,7 @@ xfs_shift_file_space( if (error) return error; error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - offset >> PAGE_CACHE_SHIFT, -1); + offset >> PAGE_SHIFT, -1); if (error) return error; @@ -1482,19 +1462,16 @@ xfs_shift_file_space( } while (!error && !done) { - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); /* * We would need to reserve permanent block for transaction. * This will come into picture when after shifting extent into * hole we found that adjacent extents can be merged which * may lead to freeing of a block during record update. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) break; - } xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, @@ -1747,12 +1724,9 @@ xfs_swap_extents( if (error) goto out_unlock; - tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) goto out_unlock; - } /* * Lock and join the inodes to the tansaction so that transaction commit diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9a2191b91137..e71cfbd5acb3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1100,22 +1100,18 @@ xfs_bwrite( return error; } -STATIC void +static void xfs_buf_bio_end_io( struct bio *bio) { - xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; /* * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (bio->bi_error) { - spin_lock(&bp->b_lock); - if (!bp->b_io_error) - bp->b_io_error = bio->bi_error; - spin_unlock(&bp->b_lock); - } + if (bio->bi_error) + cmpxchg(&bp->b_io_error, 0, bio->bi_error); if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 4eb89bd4ee73..8bfb974f0772 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -183,6 +183,26 @@ typedef struct xfs_buf { unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ int b_error; /* error code on I/O */ + + /* + * async write failure retry count. Initialised to zero on the first + * failure, then when it exceeds the maximum configured without a + * success the write is considered to be failed permanently and the + * iodone handler will take appropriate action. + * + * For retry timeouts, we record the jiffie of the first failure. This + * means that we can change the retry timeout for buffers already under + * I/O and thus avoid getting stuck in a retry loop with a long timeout. + * + * last_error is used to ensure that we are getting repeated errors, not + * different errors. e.g. a block device might change ENOSPC to EIO when + * a failure timeout occurs, so we want to re-initialise the error + * retry behaviour appropriately when that happens. + */ + int b_retries; + unsigned long b_first_retry_time; /* in jiffies */ + int b_last_error; + const struct xfs_buf_ops *b_ops; #ifdef XFS_BUF_LOCK_TRACKING diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 99e91a0e554e..34257992934c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1042,35 +1042,22 @@ xfs_buf_do_callbacks( } } -/* - * This is the iodone() function for buffers which have had callbacks - * attached to them by xfs_buf_attach_iodone(). It should remove each - * log item from the buffer's list and call the callback of each in turn. - * When done, the buffer's fsprivate field is set to NULL and the buffer - * is unlocked with a call to iodone(). - */ -void -xfs_buf_iodone_callbacks( +static bool +xfs_buf_iodone_callback_error( struct xfs_buf *bp) { struct xfs_log_item *lip = bp->b_fspriv; struct xfs_mount *mp = lip->li_mountp; static ulong lasttime; static xfs_buftarg_t *lasttarg; - - if (likely(!bp->b_error)) - goto do_callbacks; + struct xfs_error_cfg *cfg; /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ - if (XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_stale(bp); - bp->b_flags |= XBF_DONE; - trace_xfs_buf_item_iodone(bp, _RET_IP_); - goto do_callbacks; - } + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_stale; if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { @@ -1079,45 +1066,93 @@ xfs_buf_iodone_callbacks( } lasttarg = bp->b_target; + /* synchronous writes will have callers process the error */ + if (!(bp->b_flags & XBF_ASYNC)) + goto out_stale; + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + ASSERT(bp->b_iodone != NULL); + /* * If the write was asynchronous then no one will be looking for the - * error. Clear the error state and write the buffer out again. - * - * XXX: This helps against transient write errors, but we need to find - * a way to shut the filesystem down if the writes keep failing. - * - * In practice we'll shut the filesystem down soon as non-transient - * errors tend to affect the whole device and a failing log write - * will make us give up. But we really ought to do better here. + * error. If this is the first failure of this type, clear the error + * state and write the buffer out again. This means we always retry an + * async write failure at least once, but we also need to set the buffer + * up to behave correctly now for repeated failures. */ - if (bp->b_flags & XBF_ASYNC) { - ASSERT(bp->b_iodone != NULL); + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) || + bp->b_last_error != bp->b_error) { + bp->b_flags |= (XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL); + bp->b_last_error = bp->b_error; + bp->b_retries = 0; + bp->b_first_retry_time = jiffies; + + xfs_buf_ioerror(bp, 0); + xfs_buf_submit(bp); + return true; + } - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + /* + * Repeated failure on an async write. Take action according to the + * error configuration we have been set up to use. + */ + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); - xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ + if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && + ++bp->b_retries > cfg->max_retries) + goto permanent_error; + if (cfg->retry_timeout && + time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) + goto permanent_error; - if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { - bp->b_flags |= XBF_WRITE | XBF_ASYNC | - XBF_DONE | XBF_WRITE_FAIL; - xfs_buf_submit(bp); - } else { - xfs_buf_relse(bp); - } + /* At unmount we may treat errors differently */ + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + goto permanent_error; - return; - } + /* still a transient error, higher layers will retry */ + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; /* - * If the write of the buffer was synchronous, we want to make - * sure to return the error to the caller of xfs_bwrite(). + * Permanent error - we need to trigger a shutdown if we haven't already + * to indicate that inconsistency will result from this action. */ +permanent_error: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); +out_stale: xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; - trace_xfs_buf_error_relse(bp, _RET_IP_); + return false; +} + +/* + * This is the iodone() function for buffers which have had callbacks attached + * to them by xfs_buf_attach_iodone(). We need to iterate the items on the + * callback list, mark the buffer as having no more callbacks and then push the + * buffer through IO completion processing. + */ +void +xfs_buf_iodone_callbacks( + struct xfs_buf *bp) +{ + /* + * If there is an error, process it. Some errors require us + * to run callbacks after failure processing is done so we + * detect that and take appropriate action. + */ + if (bp->b_error && xfs_buf_iodone_callback_error(bp)) + return; + + /* + * Successful IO or permanent error. Either way, we can clear the + * retry state here in preparation for the next error that may occur. + */ + bp->b_last_error = 0; + bp->b_retries = 0; -do_callbacks: xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 93b3ab0c5435..f44f79996978 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -273,10 +273,11 @@ xfs_dir2_leaf_readbuf( size_t bufsize, struct xfs_dir2_leaf_map_info *mip, xfs_dir2_off_t *curoff, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + bool trim_map) { struct xfs_inode *dp = args->dp; - struct xfs_buf *bp = *bpp; + struct xfs_buf *bp = NULL; struct xfs_bmbt_irec *map = mip->map; struct blk_plug plug; int error = 0; @@ -286,13 +287,10 @@ xfs_dir2_leaf_readbuf( struct xfs_da_geometry *geo = args->geo; /* - * If we have a buffer, we need to release it and - * take it out of the mapping. + * If the caller just finished processing a buffer, it will tell us + * we need to trim that block out of the mapping now it is done. */ - - if (bp) { - xfs_trans_brelse(NULL, bp); - bp = NULL; + if (trim_map) { mip->map_blocks -= geo->fsbcount; /* * Loop to get rid of the extents for the @@ -533,10 +531,17 @@ xfs_dir2_leaf_getdents( */ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { int lock_mode; + bool trim_map = false; + + if (bp) { + xfs_trans_brelse(NULL, bp); + bp = NULL; + trim_map = true; + } lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, - &curoff, &bp); + &curoff, &bp, trim_map); xfs_iunlock(dp, lock_mode); if (error || !map_info->map_valid) break; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 316b2a1bdba5..e0646659ce16 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -614,11 +614,10 @@ xfs_qm_dqread( trace_xfs_dqread(dqp); if (flags & XFS_QMOPT_DQALLOC) { - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); if (error) - goto error1; + goto error0; } /* @@ -692,7 +691,7 @@ error0: * end of the chunk, skip ahead to first id in next allocated chunk * using the SEEK_DATA interface. */ -int +static int xfs_dq_get_next_id( xfs_mount_t *mp, uint type, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ac0fd32de31e..47fc63295422 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -106,8 +106,8 @@ xfs_iozero( unsigned offset, bytes; void *fsdata; - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - bytes = PAGE_CACHE_SIZE - offset; + offset = (pos & (PAGE_SIZE -1)); /* Within page */ + bytes = PAGE_SIZE - offset; if (bytes > count) bytes = count; @@ -145,12 +145,10 @@ xfs_update_prealloc_flags( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); - error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, + 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -718,18 +716,19 @@ xfs_file_dio_aio_write( int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); - loff_t pos = iocb->ki_pos; loff_t end; struct iov_iter data; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) + if (!IS_DAX(inode) && + ((iocb->ki_pos | count) & target->bt_logical_sectormask)) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ - if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + count) & mp->m_blockmask)) unaligned_io = 1; /* @@ -760,8 +759,7 @@ xfs_file_dio_aio_write( if (ret) goto out; count = iov_iter_count(from); - pos = iocb->ki_pos; - end = pos + count - 1; + end = iocb->ki_pos + count - 1; /* * See xfs_file_read_iter() for why we do a full-file flush here. @@ -794,19 +792,18 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); data = *from; - ret = mapping->a_ops->direct_IO(iocb, &data, pos); + ret = mapping->a_ops->direct_IO(iocb, &data); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, - pos >> PAGE_CACHE_SHIFT, - end >> PAGE_CACHE_SHIFT); + iocb->ki_pos >> PAGE_SHIFT, + end >> PAGE_SHIFT); } if (ret > 0) { - pos += ret; + iocb->ki_pos += ret; iov_iter_advance(from, ret); - iocb->ki_pos = pos; } out: xfs_rw_iunlock(ip, iolock); @@ -904,14 +901,10 @@ xfs_file_write_iter( ret = xfs_file_buffered_aio_write(iocb, from); if (ret > 0) { - ssize_t err; - XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); /* Handle various SYNC-type writes */ - err = generic_write_sync(file, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; + ret = generic_write_sync(iocb, ret); } return ret; } @@ -1207,9 +1200,9 @@ xfs_find_get_desired_pgoff( pagevec_init(&pvec, 0); - index = startoff >> PAGE_CACHE_SHIFT; + index = startoff >> PAGE_SHIFT; endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); - end = endoff >> PAGE_CACHE_SHIFT; + end = endoff >> PAGE_SHIFT; do { int want; unsigned nr_pages; @@ -1558,7 +1551,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); ret = block_page_mkwrite_return(ret); @@ -1592,7 +1585,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1629,8 +1622,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, - NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) @@ -1714,7 +1706,7 @@ const struct file_operations xfs_file_operations = { const struct file_operations xfs_dir_file_operations = { .open = xfs_dir_open, .read = generic_read_dir, - .iterate = xfs_file_readdir, + .iterate_shared = xfs_file_readdir, .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ee3aaa0a5317..b4d75825ae37 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -198,14 +198,10 @@ xfs_growfs_data_private( return error; } - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, - XFS_GROWFS_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (error) return error; - } /* * Write new AG headers to disk. Non-transactional, but written @@ -243,8 +239,8 @@ xfs_growfs_data_private( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - agf->agf_flfirst = 0; - agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); + agf->agf_flfirst = cpu_to_be32(1); + agf->agf_fllast = 0; agf->agf_flcount = 0; tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index bf2d60749278..99ee6eee5e0b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -37,9 +37,6 @@ #include <linux/kthread.h> #include <linux/freezer.h> -STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, - struct xfs_perag *pag, struct xfs_inode *ip); - /* * Allocate and initialise an xfs_inode. */ @@ -94,13 +91,6 @@ xfs_inode_free_callback( struct inode *inode = container_of(head, struct inode, i_rcu); struct xfs_inode *ip = XFS_I(inode); - kmem_zone_free(xfs_inode_zone, ip); -} - -void -xfs_inode_free( - struct xfs_inode *ip) -{ switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: @@ -118,6 +108,25 @@ xfs_inode_free( ip->i_itemp = NULL; } + kmem_zone_free(xfs_inode_zone, ip); +} + +static void +__xfs_inode_free( + struct xfs_inode *ip) +{ + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(ip->i_mount, vn_active); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +void +xfs_inode_free( + struct xfs_inode *ip) +{ /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -129,12 +138,123 @@ xfs_inode_free( ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); - XFS_STATS_DEC(ip->i_mount, vn_active); + __xfs_inode_free(ip); +} - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs periodic sync default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_reclaim_work_queue( + struct xfs_mount *mp) +{ + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_work_queue(mp); +} + +static void +xfs_perag_set_reclaim_tag( + struct xfs_perag *pag) +{ + struct xfs_mount *mp = pag->pag_mount; + + ASSERT(spin_is_locked(&pag->pag_ici_lock)); + if (pag->pag_ici_reclaimable++) + return; + + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&mp->m_perag_lock); + radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, + XFS_ICI_RECLAIM_TAG); + spin_unlock(&mp->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_reclaim_work_queue(mp); + + trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); +} + +static void +xfs_perag_clear_reclaim_tag( + struct xfs_perag *pag) +{ + struct xfs_mount *mp = pag->pag_mount; + + ASSERT(spin_is_locked(&pag->pag_ici_lock)); + if (--pag->pag_ici_reclaimable) + return; + + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&mp->m_perag_lock); + radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, + XFS_ICI_RECLAIM_TAG); + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); +} + + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + xfs_perag_set_reclaim_tag(pag); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +xfs_inode_clear_reclaim_tag( + struct xfs_perag *pag, + xfs_ino_t ino) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(pag->pag_mount, ino), + XFS_ICI_RECLAIM_TAG); + xfs_perag_clear_reclaim_tag(pag); } /* @@ -264,7 +384,7 @@ xfs_iget_cache_hit( */ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; ip->i_flags |= XFS_INEW; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); + xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); @@ -723,121 +843,6 @@ xfs_inode_ag_iterator_tag( } /* - * Queue a new inode reclaim pass if there are reclaimable inodes and there - * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs periodic sync default of 30s. Perhaps this should have it's own - * tunable, but that can be done if this method proves to be ineffective or too - * aggressive. - */ -static void -xfs_reclaim_work_queue( - struct xfs_mount *mp) -{ - - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, - msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); - } - rcu_read_unlock(); -} - -/* - * This is a fast pass over the inode cache to try to get reclaim moving on as - * many inodes as possible in a short period of time. It kicks itself every few - * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. - */ -void -xfs_reclaim_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_reclaim_work); - - xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_reclaim_work_queue(mp); -} - -static void -__xfs_inode_set_reclaim_tag( - struct xfs_perag *pag, - struct xfs_inode *ip) -{ - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - - if (!pag->pag_ici_reclaimable) { - /* propagate the reclaim tag up into the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - - /* schedule periodic background inode reclaim */ - xfs_reclaim_work_queue(ip->i_mount); - - trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } - pag->pag_ici_reclaimable++; -} - -/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. - */ -void -xfs_inode_set_reclaim_tag( - xfs_inode_t *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - __xfs_inode_set_reclaim_tag(pag, ip); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); -} - -STATIC void -__xfs_inode_clear_reclaim( - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - pag->pag_ici_reclaimable--; - if (!pag->pag_ici_reclaimable) { - /* clear the reclaim tag from the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } -} - -STATIC void -__xfs_inode_clear_reclaim_tag( - xfs_mount_t *mp, - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - __xfs_inode_clear_reclaim(pag, ip); -} - -/* * Grab the inode for reclaim exclusively. * Return 0 if we grabbed it, non-zero otherwise. */ @@ -929,6 +934,7 @@ xfs_reclaim_inode( int sync_mode) { struct xfs_buf *bp = NULL; + xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ int error; restart: @@ -993,6 +999,22 @@ restart: xfs_iflock(ip); reclaim: + /* + * Because we use RCU freeing we need to ensure the inode always appears + * to be reclaimed with an invalid inode number when in the free state. + * We do this as early as possible under the ILOCK and flush lock so + * that xfs_iflush_cluster() can be guaranteed to detect races with us + * here. By doing this, we guarantee that once xfs_iflush_cluster has + * locked both the XFS_ILOCK and the flush lock that it will see either + * a valid, flushable inode that will serialise correctly against the + * locks below, or it will see a clean (and invalid) inode that it can + * skip. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1006,9 +1028,9 @@ reclaim: */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); - __xfs_inode_clear_reclaim(pag, ip); + xfs_perag_clear_reclaim_tag(pag); spin_unlock(&pag->pag_ici_lock); /* @@ -1023,7 +1045,7 @@ reclaim: xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_inode_free(ip); + __xfs_inode_free(ip); return error; out_ifunlock: diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 96f606deee31..ee6799e0476f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1030,7 +1030,7 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - code = xfs_trans_roll(&tp, 0); + code = xfs_trans_roll(&tp, NULL); if (committed != NULL) *committed = 1; @@ -1161,11 +1161,9 @@ xfs_create( rdev = 0; resblks = XFS_MKDIR_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_mkdir; - tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); } else { resblks = XFS_CREATE_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_create; - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } /* @@ -1174,20 +1172,19 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. */ - error = xfs_trans_reserve(tp, tres, resblks, 0); + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_reserve(tp, tres, resblks, 0); + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, tres, 0, 0); + error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); } if (error) - goto out_trans_cancel; - + goto out_release_inode; xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); @@ -1337,17 +1334,16 @@ xfs_create_tmpfile( return error; resblks = XFS_IALLOC_SPACE_RES(mp); - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); - tres = &M_RES(mp)->tr_create_tmpfile; - error = xfs_trans_reserve(tp, tres, resblks, 0); + + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, tres, 0, 0); + error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); } if (error) - goto out_trans_cancel; + goto out_release_inode; error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); @@ -1432,15 +1428,14 @@ xfs_link( if (error) goto std_return; - tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); resblks = XFS_LINK_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); if (error == -ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); } if (error) - goto error_return; + goto std_return; xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); @@ -1710,11 +1705,9 @@ xfs_inactive_truncate( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); return error; } @@ -1764,8 +1757,6 @@ xfs_inactive_ifree( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - /* * The ifree transaction might need to allocate blocks for record * insertion to the finobt. We don't want to fail here at ENOSPC, so @@ -1781,9 +1772,8 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); if (error) { if (error == -ENOSPC) { xfs_warn_ratelimited(mp, @@ -1792,7 +1782,6 @@ xfs_inactive_ifree( } else { ASSERT(XFS_FORCED_SHUTDOWN(mp)); } - xfs_trans_cancel(tp); return error; } @@ -2525,11 +2514,6 @@ xfs_remove( if (error) goto std_return; - if (is_dir) - tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - else - tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - /* * We try to get the real space reservation first, * allowing for directory btree deletion(s) implying @@ -2540,14 +2524,15 @@ xfs_remove( * block from the directory. */ resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); if (error == -ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, + &tp); } if (error) { ASSERT(error != -ENOSPC); - goto out_trans_cancel; + goto std_return; } xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); @@ -2855,6 +2840,7 @@ xfs_rename_alloc_whiteout( * and flag it as linkable. */ drop_nlink(VFS_I(tmpfile)); + xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; @@ -2910,15 +2896,15 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); - tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { spaceres = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, + &tp); } if (error) - goto out_trans_cancel; + goto out_release_wip; /* * Attach the dquots to the inodes @@ -3155,6 +3141,7 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: xfs_trans_cancel(tp); +out_release_wip: if (wip) IRELE(wip); return error; @@ -3162,16 +3149,16 @@ out_trans_cancel: STATIC int xfs_iflush_cluster( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_mount_t *mp = ip->i_mount; + struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; unsigned long first_index, mask; unsigned long inodes_per_cluster; - int ilist_size; - xfs_inode_t **ilist; - xfs_inode_t *iq; + int cilist_size; + struct xfs_inode **cilist; + struct xfs_inode *cip; int nr_found; int clcount = 0; int bufwasdelwri; @@ -3180,23 +3167,23 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; - ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); - ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); - if (!ilist) + cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); + if (!cilist) goto out_put; mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, first_index, inodes_per_cluster); if (nr_found == 0) goto out_free; for (i = 0; i < nr_found; i++) { - iq = ilist[i]; - if (iq == ip) + cip = cilist[i]; + if (cip == ip) continue; /* @@ -3205,20 +3192,30 @@ xfs_iflush_cluster( * We need to check under the i_flags_lock for a valid inode * here. Skip it if it is not valid or the wrong inode. */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino || - (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { - spin_unlock(&ip->i_flags_lock); + spin_lock(&cip->i_flags_lock); + if (!cip->i_ino || + __xfs_iflags_test(cip, XFS_ISTALE)) { + spin_unlock(&cip->i_flags_lock); continue; } - spin_unlock(&ip->i_flags_lock); + + /* + * Once we fall off the end of the cluster, no point checking + * any more inodes in the list because they will also all be + * outside the cluster. + */ + if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { + spin_unlock(&cip->i_flags_lock); + break; + } + spin_unlock(&cip->i_flags_lock); /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated * later after the appropriate locks are acquired. */ - if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) continue; /* @@ -3226,15 +3223,28 @@ xfs_iflush_cluster( * then this inode cannot be flushed and is skipped. */ - if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(cip)) { + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; - if (!xfs_iflock_nowait(iq)) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + if (xfs_ipincount(cip)) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } - if (xfs_ipincount(iq)) { - xfs_ifunlock(iq); - xfs_iunlock(iq, XFS_ILOCK_SHARED); + + + /* + * Check the inode number again, just to be certain we are not + * racing with freeing in xfs_reclaim_inode(). See the comments + * in that function for more information as to why the initial + * check is not sufficient. + */ + if (!cip->i_ino) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } @@ -3242,18 +3252,18 @@ xfs_iflush_cluster( * arriving here means that this inode can be flushed. First * re-check that it's dirty before flushing. */ - if (!xfs_inode_clean(iq)) { + if (!xfs_inode_clean(cip)) { int error; - error = xfs_iflush_int(iq, bp); + error = xfs_iflush_int(cip, bp); if (error) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); goto cluster_corrupt_out; } clcount++; } else { - xfs_ifunlock(iq); + xfs_ifunlock(cip); } - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); } if (clcount) { @@ -3263,7 +3273,7 @@ xfs_iflush_cluster( out_free: rcu_read_unlock(); - kmem_free(ilist); + kmem_free(cilist); out_put: xfs_perag_put(pag); return 0; @@ -3306,8 +3316,8 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq, false); - kmem_free(ilist); + xfs_iflush_abort(cip, false); + kmem_free(cilist); xfs_perag_put(pag); return -EFSCORRUPTED; } @@ -3327,7 +3337,7 @@ xfs_iflush( struct xfs_buf **bpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp; + struct xfs_buf *bp = NULL; struct xfs_dinode *dip; int error; @@ -3369,14 +3379,22 @@ xfs_iflush( } /* - * Get the buffer containing the on-disk inode. + * Get the buffer containing the on-disk inode. We are doing a try-lock + * operation here, so we may get an EAGAIN error. In that case, we + * simply want to return with the inode still dirty. + * + * If we get any other error, we effectively have a corruption situation + * and we cannot flush the inode, so we treat it the same as failing + * xfs_iflush_int(). */ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, 0); - if (error || !bp) { + if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } + if (error) + goto corrupt_out; /* * First flush out the inode that xfs_iflush was called with. @@ -3404,7 +3422,8 @@ xfs_iflush( return 0; corrupt_out: - xfs_buf_relse(bp); + if (bp) + xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: error = -EFSCORRUPTED; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 43e1d51b15eb..e52d7c7aeb5b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -440,6 +440,9 @@ loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, /* from xfs_iops.c */ +extern void xfs_setup_inode(struct xfs_inode *ip); +extern void xfs_setup_iops(struct xfs_inode *ip); + /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at @@ -447,7 +450,6 @@ loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, * before we've completed instantiation. Otherwise we can do it * the moment the inode lookup is complete. */ -extern void xfs_setup_inode(struct xfs_inode *ip); static inline void xfs_finish_inode_setup(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_INEW); @@ -458,6 +460,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) static inline void xfs_setup_existing_inode(struct xfs_inode *ip) { xfs_setup_inode(ip); + xfs_setup_iops(ip); xfs_finish_inode_setup(ip); } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c48b5b18d771..a1b07612224c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork( */ data_bytes = roundup(ip->i_df.if_bytes, 4); ASSERT(ip->i_df.if_real_bytes == 0 || - ip->i_df.if_real_bytes == data_bytes); + ip->i_df.if_real_bytes >= data_bytes); ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_d.di_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, @@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork( */ data_bytes = roundup(ip->i_afp->if_bytes, 4); ASSERT(ip->i_afp->if_real_bytes == 0 || - ip->i_afp->if_real_bytes == data_bytes); + ip->i_afp->if_real_bytes >= data_bytes); ASSERT(ip->i_afp->if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, ip->i_afp->if_u1.if_data, @@ -479,6 +479,8 @@ STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, struct list_head *buffer_list) + __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bcb6c19ce3ea..dbca7375deef 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -277,7 +277,6 @@ xfs_readlink_by_handle( { struct dentry *dentry; __u32 olen; - void *link; int error; if (!capable(CAP_SYS_ADMIN)) @@ -288,7 +287,7 @@ xfs_readlink_by_handle( return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. */ - if (!d_is_symlink(dentry)) { + if (!d_inode(dentry)->i_op->readlink) { error = -EINVAL; goto out_dput; } @@ -298,21 +297,8 @@ xfs_readlink_by_handle( goto out_dput; } - link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); - if (!link) { - error = -ENOMEM; - goto out_dput; - } - - error = xfs_readlink(XFS_I(d_inode(dentry)), link); - if (error) - goto out_kfree; - error = readlink_copy(hreq->ohandle, olen, link); - if (error) - goto out_kfree; + error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen); - out_kfree: - kfree(link); out_dput: dput(dentry); return error; @@ -334,12 +320,10 @@ xfs_set_dmattrs( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -1141,10 +1125,9 @@ xfs_ioctl_setattr_get_trans( if (XFS_FORCED_SHUTDOWN(mp)) goto out_unlock; - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - goto out_cancel; + return ERR_PTR(error); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d81bdc080370..58391355a44d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -132,6 +132,7 @@ xfs_iomap_write_direct( int error; int lockmode; int bmapi_flags = XFS_BMAPI_PREALLOC; + uint tflags = 0; rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); @@ -192,11 +193,6 @@ xfs_iomap_write_direct( return error; /* - * Allocate and setup the transaction - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - - /* * For DAX, we do not allocate unwritten extents, but instead we zero * the block before we commit the transaction. Ideally we'd like to do * this outside the transaction context, but if we commit and then crash @@ -209,23 +205,17 @@ xfs_iomap_write_direct( * the reserve block pool for bmbt block allocation if there is no space * left but we need to do unwritten extent conversion. */ - if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (ISUNWRITTEN(imap)) { - tp->t_flags |= XFS_TRANS_RESERVE; + tflags |= XFS_TRANS_RESERVE; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, resrtextents); - /* - * Check for running out of space, note: need lock to return - */ - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents, + tflags, &tp); + if (error) return error; - } lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); @@ -726,15 +716,13 @@ xfs_iomap_write_allocate( nimaps = 0; while (nimaps == 0) { - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); - tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - nres, 0); - if (error) { - xfs_trans_cancel(tp); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres, + 0, XFS_TRANS_RESERVE, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -878,25 +866,18 @@ xfs_iomap_write_unwritten( do { /* - * set up a transaction to convert the range of extents + * Set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. * - * Note that we open code the transaction allocation here - * to pass KM_NOFS--we can't risk to recursing back into - * the filesystem here as we might be asked to write out - * the same inode that we complete here and might deadlock - * on the iolock. + * Note that we can't risk to recursing back into the filesystem + * here as we might be asked to write out the same inode that we + * complete here and might deadlock on the iolock. */ - sb_start_intwrite(mp->m_super); - tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); - tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index fb7dc61f4a29..c5d4eba6972e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -181,6 +181,8 @@ xfs_generic_create( } #endif + xfs_setup_iops(ip); + if (tmpfile) d_tmpfile(dentry, inode); else @@ -368,6 +370,8 @@ xfs_vn_symlink( if (unlikely(error)) goto out_cleanup_inode; + xfs_setup_iops(cip); + d_instantiate(dentry, inode); xfs_finish_inode_setup(cip); return 0; @@ -442,6 +446,16 @@ xfs_vn_get_link( return ERR_PTR(error); } +STATIC const char * +xfs_vn_get_link_inline( + struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); + return XFS_I(inode)->i_df.if_u1.if_data; +} + STATIC int xfs_vn_getattr( struct vfsmount *mnt, @@ -599,12 +613,12 @@ xfs_setattr_nonsize( return error; } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + goto out_dqrele; xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. @@ -633,12 +647,10 @@ xfs_setattr_nonsize( NULL, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ - goto out_unlock; + goto out_cancel; } } - xfs_trans_ijoin(tp, ip, 0); - /* * Change file ownership. Must be the owner or privileged. */ @@ -722,10 +734,9 @@ xfs_setattr_nonsize( return 0; -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -out_trans_cancel: +out_cancel: xfs_trans_cancel(tp); +out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; @@ -834,7 +845,7 @@ xfs_setattr_size( * We have to do all the page cache truncate work outside the * transaction context as the "lock" order is page lock->log space * reservation as defined by extent allocation in the writeback path. - * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but * having already truncated the in-memory version of the file (i.e. made * user visible changes). There's not much we can do about this, except * to hope that the caller sees ENOMEM and retries the truncate @@ -849,10 +860,9 @@ xfs_setattr_size( return error; truncate_setsize(inode, newsize); - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -971,12 +981,9 @@ xfs_vn_update_time( trace_xfs_update_time(ip); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); if (flags & S_CTIME) @@ -1167,6 +1174,18 @@ static const struct inode_operations xfs_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; +static const struct inode_operations xfs_inline_symlink_inode_operations = { + .readlink = generic_readlink, + .get_link = xfs_vn_get_link_inline, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, +}; + STATIC void xfs_diflags_to_iflags( struct inode *inode, @@ -1193,7 +1212,7 @@ xfs_diflags_to_iflags( } /* - * Initialize the Linux inode and set up the operation vectors. + * Initialize the Linux inode. * * When reading existing inodes from disk this is called directly from xfs_iget, * when creating a new inode it is called from xfs_ialloc after setting up the @@ -1232,32 +1251,12 @@ xfs_setup_inode( i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); - ip->d_ops = ip->i_mount->m_nondir_inode_ops; - lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - inode->i_op = &xfs_inode_operations; - inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - case S_IFDIR: + if (S_ISDIR(inode->i_mode)) { lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); - if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) - inode->i_op = &xfs_dir_ci_inode_operations; - else - inode->i_op = &xfs_dir_inode_operations; - inode->i_fop = &xfs_dir_file_operations; ip->d_ops = ip->i_mount->m_dir_inode_ops; - break; - case S_IFLNK: - inode->i_op = &xfs_symlink_inode_operations; - if (!(ip->i_df.if_flags & XFS_IFINLINE)) - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - default: - inode->i_op = &xfs_inode_operations; - init_special_inode(inode, inode->i_mode, inode->i_rdev); - break; + } else { + ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); } /* @@ -1277,3 +1276,35 @@ xfs_setup_inode( cache_no_acl(inode); } } + +void +xfs_setup_iops( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + break; + case S_IFLNK: + if (ip->i_df.if_flags & XFS_IFINLINE) + inode->i_op = &xfs_inline_symlink_inode_operations; + else + inode->i_op = &xfs_symlink_inode_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } +} diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ec0e239a0fa9..a8192dc797dc 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -135,7 +135,7 @@ typedef __u32 xfs_nlink_t; * Size of block device i/o is parameterized here. * Currently the system supports page-sized i/o. */ -#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT +#define BLKDEV_IOSHIFT PAGE_SHIFT #define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT) /* number of BB's per block device block */ #define BLKDEV_BB BTOBB(BLKDEV_IOSIZE) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b49ccf5c1d75..bde02f1fba73 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -435,8 +435,7 @@ xfs_log_reserve( int cnt, struct xlog_ticket **ticp, __uint8_t client, - bool permanent, - uint t_type) + bool permanent) { struct xlog *log = mp->m_log; struct xlog_ticket *tic; @@ -456,7 +455,6 @@ xfs_log_reserve( if (!tic) return -ENOMEM; - tic->t_trans_type = t_type; *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { - error = xfs_log_reserve(mp, 600, 1, &tic, - XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); + error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); if (!error) { /* the data section must be 32 bit size aligned */ struct { @@ -2032,58 +2029,8 @@ xlog_print_tic_res( REG_TYPE_STR(ICREATE, "inode create") }; #undef REG_TYPE_STR -#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type - static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { - TRANS_TYPE_STR(SETATTR_NOT_SIZE), - TRANS_TYPE_STR(SETATTR_SIZE), - TRANS_TYPE_STR(INACTIVE), - TRANS_TYPE_STR(CREATE), - TRANS_TYPE_STR(CREATE_TRUNC), - TRANS_TYPE_STR(TRUNCATE_FILE), - TRANS_TYPE_STR(REMOVE), - TRANS_TYPE_STR(LINK), - TRANS_TYPE_STR(RENAME), - TRANS_TYPE_STR(MKDIR), - TRANS_TYPE_STR(RMDIR), - TRANS_TYPE_STR(SYMLINK), - TRANS_TYPE_STR(SET_DMATTRS), - TRANS_TYPE_STR(GROWFS), - TRANS_TYPE_STR(STRAT_WRITE), - TRANS_TYPE_STR(DIOSTRAT), - TRANS_TYPE_STR(WRITEID), - TRANS_TYPE_STR(ADDAFORK), - TRANS_TYPE_STR(ATTRINVAL), - TRANS_TYPE_STR(ATRUNCATE), - TRANS_TYPE_STR(ATTR_SET), - TRANS_TYPE_STR(ATTR_RM), - TRANS_TYPE_STR(ATTR_FLAG), - TRANS_TYPE_STR(CLEAR_AGI_BUCKET), - TRANS_TYPE_STR(SB_CHANGE), - TRANS_TYPE_STR(DUMMY1), - TRANS_TYPE_STR(DUMMY2), - TRANS_TYPE_STR(QM_QUOTAOFF), - TRANS_TYPE_STR(QM_DQALLOC), - TRANS_TYPE_STR(QM_SETQLIM), - TRANS_TYPE_STR(QM_DQCLUSTER), - TRANS_TYPE_STR(QM_QINOCREATE), - TRANS_TYPE_STR(QM_QUOTAOFF_END), - TRANS_TYPE_STR(FSYNC_TS), - TRANS_TYPE_STR(GROWFSRT_ALLOC), - TRANS_TYPE_STR(GROWFSRT_ZERO), - TRANS_TYPE_STR(GROWFSRT_FREE), - TRANS_TYPE_STR(SWAPEXT), - TRANS_TYPE_STR(CHECKPOINT), - TRANS_TYPE_STR(ICREATE), - TRANS_TYPE_STR(CREATE_TMPFILE) - }; -#undef TRANS_TYPE_STR xfs_warn(mp, "xlog_write: reservation summary:"); - xfs_warn(mp, " trans type = %s (%u)", - ((ticket->t_trans_type <= 0 || - ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? - "bad-trans-type" : trans_type_str[ticket->t_trans_type]), - ticket->t_trans_type); xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); xfs_warn(mp, " current res = %d bytes", @@ -3378,7 +3325,7 @@ xfs_log_force( { int error; - trace_xfs_log_force(mp, 0); + trace_xfs_log_force(mp, 0, _RET_IP_); error = _xfs_log_force(mp, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3527,7 +3474,7 @@ xfs_log_force_lsn( { int error; - trace_xfs_log_force(mp, lsn); + trace_xfs_log_force(mp, lsn, _RET_IP_); error = _xfs_log_force_lsn(mp, lsn, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3709,7 +3656,6 @@ xlog_ticket_alloc( tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; - tic->t_trans_type = 0; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index aa533a7d50f2..80ba0c047090 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -161,8 +161,7 @@ int xfs_log_reserve(struct xfs_mount *mp, int count, struct xlog_ticket **ticket, __uint8_t clientid, - bool permanent, - uint t_type); + bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4e7649351f5a..5e54e7955ea6 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -51,7 +51,6 @@ xlog_cil_ticket_alloc( tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, KM_SLEEP|KM_NOFS); - tic->t_trans_type = XFS_TRANS_CHECKPOINT; /* * set the current reservation to zero so we know to steal the basic diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ed8896310c00..765f084759b5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -175,7 +175,6 @@ typedef struct xlog_ticket { char t_cnt; /* current count : 1 */ char t_clientid; /* who does this belong to; : 1 */ char t_flags; /* properties of reservation : 1 */ - uint t_trans_type; /* transaction type : 4 */ /* reservation array fields */ uint t_res_num; /* num in array : 4 */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 396565f43247..835997843846 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3843,7 +3843,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); + ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -4205,10 +4205,9 @@ xlog_recover_process_efi( } } - tp = xfs_trans_alloc(mp, 0); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) - goto abort_error; + return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { @@ -4355,10 +4354,9 @@ xlog_recover_clear_agi_bucket( int offset; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp); if (error) - goto out_abort; + goto out_error; error = xfs_read_agi(mp, tp, agno, &agibp); if (error) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 536a0ee9cd5a..e39b02351b4a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -89,7 +89,6 @@ xfs_uuid_mount( if (hole < 0) { xfs_uuid_table = kmem_realloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - xfs_uuid_table_size * sizeof(*xfs_uuid_table), KM_SLEEP); hole = xfs_uuid_table_size++; } @@ -171,7 +170,7 @@ xfs_sb_validate_fsb_count( ASSERT(sbp->sb_blocklog >= BBSHIFT); /* Limited by ULONG_MAX of page cache index */ - if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) + if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) return -EFBIG; return 0; } @@ -681,6 +680,9 @@ xfs_mountfs( xfs_set_maxicount(mp); + /* enable fail_at_unmount as default */ + mp->m_fail_unmount = 1; + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) goto out; @@ -690,10 +692,15 @@ xfs_mountfs( if (error) goto out_remove_sysfs; - error = xfs_uuid_mount(mp); + error = xfs_error_sysfs_init(mp); if (error) goto out_del_stats; + + error = xfs_uuid_mount(mp); + if (error) + goto out_remove_error_sysfs; + /* * Set the minimum read and write sizes */ @@ -957,6 +964,7 @@ xfs_mountfs( cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: + mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) @@ -968,6 +976,8 @@ xfs_mountfs( xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); + out_remove_error_sysfs: + xfs_error_sysfs_del(mp); out_del_stats: xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: @@ -1006,6 +1016,14 @@ xfs_unmountfs( xfs_log_force(mp, XFS_LOG_SYNC); /* + * We now need to tell the world we are unmounting. This will allow + * us to detect that the filesystem is going away and we should error + * out anything that we have been retrying in the background. This will + * prevent neverending retries in AIL pushing from hanging the unmount. + */ + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + + /* * Flush all pending changes from the AIL. */ xfs_ail_push_all_sync(mp->m_ail); @@ -1056,6 +1074,7 @@ xfs_unmountfs( #endif xfs_free_perag(mp); + xfs_error_sysfs_del(mp); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bac6b3435591..c1b798c72126 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -37,6 +37,32 @@ enum { XFS_LOWSP_MAX, }; +/* + * Error Configuration + * + * Error classes define the subsystem the configuration belongs to. + * Error numbers define the errors that are configurable. + */ +enum { + XFS_ERR_METADATA, + XFS_ERR_CLASS_MAX, +}; +enum { + XFS_ERR_DEFAULT, + XFS_ERR_EIO, + XFS_ERR_ENOSPC, + XFS_ERR_ENODEV, + XFS_ERR_ERRNO_MAX, +}; + +#define XFS_ERR_RETRY_FOREVER -1 + +struct xfs_error_cfg { + struct xfs_kobj kobj; + int max_retries; + unsigned long retry_timeout; /* in jiffies, 0 = no timeout */ +}; + typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ @@ -127,6 +153,9 @@ typedef struct xfs_mount { int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ struct xfs_kobj m_kobj; + struct xfs_kobj m_error_kobj; + struct xfs_kobj m_error_meta_kobj; + struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ struct workqueue_struct *m_buf_workqueue; @@ -148,6 +177,7 @@ typedef struct xfs_mount { */ __uint32_t m_generation; + bool m_fail_unmount; #ifdef DEBUG /* * DEBUG mode instrumentation to test and/or trigger delayed allocation @@ -166,6 +196,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ +#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for @@ -231,12 +262,12 @@ static inline unsigned long xfs_preferred_iosize(xfs_mount_t *mp) { if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE) - return PAGE_CACHE_SIZE; + return PAGE_SIZE; return (mp->m_swidth ? (mp->m_swidth << mp->m_sb.sb_blocklog) : ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ? (1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) : - PAGE_CACHE_SIZE)); + PAGE_SIZE)); } #define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ @@ -364,4 +395,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, xfs_off_t count_fsb); +struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, + int error_class, int error); + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index ade236e90bb3..d5b756669fb5 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -293,8 +293,8 @@ xfs_fs_commit_blocks( * Make sure reads through the pagecache see the new data. */ error = invalidate_inode_pages2_range(inode->i_mapping, - start >> PAGE_CACHE_SHIFT, - (end - 1) >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT, + (end - 1) >> PAGE_SHIFT); WARN_ON_ONCE(error); error = xfs_iomap_write_unwritten(ip, start, length); @@ -308,12 +308,9 @@ xfs_fs_commit_blocks( goto out_drop_iolock; } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) goto out_drop_iolock; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index be125e1758c1..a60d9e2739d1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -783,13 +783,10 @@ xfs_qm_qino_alloc( } } - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, - XFS_QM_QINOCREATE_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, + XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp); + if (error) return error; - } if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index f4d0e0a8f517..475a3882a81f 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_IOLOCK_EXCL); - tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { - xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_IOLOCK_EXCL); goto out_put; } @@ -436,12 +434,9 @@ xfs_qm_scall_setqlim( defq = xfs_get_defquota(dqp, q); xfs_dqunlock(dqp); - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); + if (error) goto out_rele; - } xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); @@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end( int error; xfs_qoff_logitem_t *qoffi; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); + if (error) return error; - } qoffi = xfs_trans_get_qoff_item(tp, startqoff, flags & XFS_ALL_QUOTA_ACCT); @@ -603,12 +594,9 @@ xfs_qm_log_quotaoff( *qoffstartp = NULL; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); + if (error) goto out; - } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index abf44435d04a..3938b37d1043 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -780,15 +780,14 @@ xfs_growfs_rt_alloc( * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* * Reserve space & log for one extent added to the file. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, - resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks, + 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; /* * Lock the inode. */ @@ -823,14 +822,13 @@ xfs_growfs_rt_alloc( for (bno = map.br_startoff, fsbno = map.br_startblock; bno < map.br_startoff + map.br_blockcount; bno++, fsbno++) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); /* * Reserve log for one block zeroing. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, - 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, + 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; /* * Lock the bitmap inode. */ @@ -994,11 +992,10 @@ xfs_growfs_rt( /* * Start a transaction, get the log reservation. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree, - 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0, + &tp); if (error) - goto error_cancel; + break; /* * Lock out other callers by grabbing the bitmap inode lock. */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d760934109b5..11ea5d51db56 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -58,8 +58,7 @@ #include <linux/parser.h> static const struct super_operations xfs_super_operations; -static kmem_zone_t *xfs_ioend_zone; -mempool_t *xfs_ioend_pool; +struct bio_set *xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -350,6 +349,7 @@ xfs_parseargs( case Opt_pqnoenforce: mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); mp->m_qflags &= ~XFS_PQUOTA_ENFD; + break; case Opt_gquota: case Opt_grpquota: mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | @@ -556,10 +556,10 @@ xfs_max_file_offset( /* Figure out maximum filesize, on Linux this can depend on * the filesystem blocksize (on 32 bit platforms). * __block_write_begin does this in an [unsigned] long... - * page->index << (PAGE_CACHE_SHIFT - bbits) + * page->index << (PAGE_SHIFT - bbits) * So, for page sized blocks (4K on 32 bit platforms), * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) * but for smaller blocksizes it is less (bbits = log2 bsize). * Note1: get_block_t takes a long (implicit cast from above) * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch @@ -570,10 +570,10 @@ xfs_max_file_offset( #if BITS_PER_LONG == 32 # if defined(CONFIG_LBDAF) ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_CACHE_SIZE; + pagefactor = PAGE_SIZE; bitshift = BITS_PER_LONG; # else - pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); + pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift); # endif #endif @@ -928,7 +928,7 @@ xfs_fs_alloc_inode( /* * Now that the generic code is guaranteed not to be accessing - * the linux inode, we can reclaim the inode. + * the linux inode, we can inactivate and reclaim the inode. */ STATIC void xfs_fs_destroy_inode( @@ -938,9 +938,14 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - XFS_STATS_INC(ip->i_mount, vn_reclaim); + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + XFS_STATS_INC(ip->i_mount, vn_rele); + XFS_STATS_INC(ip->i_mount, vn_remove); + + xfs_inactive(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + XFS_STATS_INC(ip->i_mount, vn_reclaim); /* * We should never get here with one of the reclaim flags already set. @@ -987,24 +992,6 @@ xfs_fs_inode_init_once( "xfsino", ip->i_ino); } -STATIC void -xfs_fs_evict_inode( - struct inode *inode) -{ - xfs_inode_t *ip = XFS_I(inode); - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - - trace_xfs_evict_inode(ip); - - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - XFS_STATS_INC(ip->i_mount, vn_rele); - XFS_STATS_INC(ip->i_mount, vn_remove); - - xfs_inactive(ip); -} - /* * We do an unlocked check for XFS_IDONTCACHE here because we are already * serialised against cache hits here via the inode->i_lock and igrab() in @@ -1276,6 +1263,16 @@ xfs_fs_remount( return -EINVAL; } + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + return -EINVAL; + } + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* @@ -1558,14 +1555,12 @@ xfs_fs_fill_super( if (mp->m_flags & XFS_MOUNT_DAX) { xfs_warn(mp, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - if (sb->s_blocksize != PAGE_SIZE) { - xfs_alert(mp, - "Filesystem block size invalid for DAX Turning DAX off."); - mp->m_flags &= ~XFS_MOUNT_DAX; - } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + + error = bdev_dax_supported(sb, sb->s_blocksize); + if (error) { xfs_alert(mp, - "Block device does not support DAX Turning DAX off."); + "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } } @@ -1663,7 +1658,6 @@ xfs_fs_free_cached_objects( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, - .evict_inode = xfs_fs_evict_inode, .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, @@ -1688,20 +1682,15 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - - xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); - if (!xfs_ioend_zone) + xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, + offsetof(struct xfs_ioend, io_inline_bio)); + if (!xfs_ioend_bioset) goto out; - xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, - xfs_ioend_zone); - if (!xfs_ioend_pool) - goto out_destroy_ioend_zone; - xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), "xfs_log_ticket"); if (!xfs_log_ticket_zone) - goto out_destroy_ioend_pool; + goto out_free_ioend_bioset; xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), "xfs_bmap_free_item"); @@ -1797,10 +1786,8 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_bmap_free_item_zone); out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); - out_destroy_ioend_pool: - mempool_destroy(xfs_ioend_pool); - out_destroy_ioend_zone: - kmem_zone_destroy(xfs_ioend_zone); + out_free_ioend_bioset: + bioset_free(xfs_ioend_bioset); out: return -ENOMEM; } @@ -1826,9 +1813,7 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - mempool_destroy(xfs_ioend_pool); - kmem_zone_destroy(xfs_ioend_zone); - + bioset_free(xfs_ioend_bioset); } STATIC int __init diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index b44284c1adda..08a46c6181fd 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -131,6 +131,8 @@ xfs_readlink( trace_xfs_readlink(ip); + ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE)); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -150,12 +152,7 @@ xfs_readlink( } - if (ip->i_df.if_flags & XFS_IFINLINE) { - memcpy(link, ip->i_df.if_u1.if_data, pathlen); - link[pathlen] = '\0'; - } else { - error = xfs_readlink_bmap(ip, link); - } + error = xfs_readlink_bmap(ip, link); out: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -221,7 +218,6 @@ xfs_symlink( if (error) return error; - tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); /* * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. @@ -231,13 +227,15 @@ xfs_symlink( else fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); if (error == -ENOSPC && fs_blocks == 0) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, + &tp); } if (error) - goto out_trans_cancel; + goto out_release_inode; xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); @@ -302,19 +300,11 @@ xfs_symlink( * If the symlink will fit into the inode, write it inline. */ if (pathlen <= XFS_IFORK_DSIZE(ip)) { - xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); - memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); - ip->i_d.di_size = pathlen; - - /* - * The inode was initially created in extent format. - */ - ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ip->i_df.if_flags |= XFS_IFINLINE; + xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); + ip->i_d.di_size = pathlen; ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - } else { int offset; @@ -455,12 +445,9 @@ xfs_inactive_symlink_rmt( */ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 6ced4f143494..4c2c55086208 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -17,10 +17,11 @@ */ #include "xfs.h" -#include "xfs_sysfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_sysfs.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_stats.h" @@ -362,3 +363,291 @@ struct kobj_type xfs_log_ktype = { .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_log_attrs, }; + +/* + * Metadata IO error configuration + * + * The sysfs structure here is: + * ...xfs/<dev>/error/<class>/<errno>/<error_attrs> + * + * where <class> allows us to discriminate between data IO and metadata IO, + * and any other future type of IO (e.g. special inode or directory error + * handling) we care to support. + */ +static inline struct xfs_error_cfg * +to_error_cfg(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xfs_error_cfg, kobj); +} + +static inline struct xfs_mount * +err_to_mp(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xfs_mount, m_error_kobj); +} + +static ssize_t +max_retries_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); +} + +static ssize_t +max_retries_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < -1) + return -EINVAL; + + cfg->max_retries = val; + return count; +} +XFS_SYSFS_ATTR_RW(max_retries); + +static ssize_t +retry_timeout_seconds_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + + return snprintf(buf, PAGE_SIZE, "%ld\n", + jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); +} + +static ssize_t +retry_timeout_seconds_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + /* 1 day timeout maximum */ + if (val < 0 || val > 86400) + return -EINVAL; + + cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + return count; +} +XFS_SYSFS_ATTR_RW(retry_timeout_seconds); + +static ssize_t +fail_at_unmount_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_mount *mp = err_to_mp(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount); +} + +static ssize_t +fail_at_unmount_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_mount *mp = err_to_mp(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 1) + return -EINVAL; + + mp->m_fail_unmount = val; + return count; +} +XFS_SYSFS_ATTR_RW(fail_at_unmount); + +static struct attribute *xfs_error_attrs[] = { + ATTR_LIST(max_retries), + ATTR_LIST(retry_timeout_seconds), + NULL, +}; + + +struct kobj_type xfs_error_cfg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_error_attrs, +}; + +struct kobj_type xfs_error_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, +}; + +/* + * Error initialization tables. These need to be ordered in the same + * order as the enums used to index the array. All class init tables need to + * define a "default" behaviour as the first entry, all other entries can be + * empty. + */ +struct xfs_error_init { + char *name; + int max_retries; + int retry_timeout; /* in seconds */ +}; + +static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { + { .name = "default", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "EIO", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "ENOSPC", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "ENODEV", + .max_retries = 0, + }, +}; + +static int +xfs_error_sysfs_init_class( + struct xfs_mount *mp, + int class, + const char *parent_name, + struct xfs_kobj *parent_kobj, + const struct xfs_error_init init[]) +{ + struct xfs_error_cfg *cfg; + int error; + int i; + + ASSERT(class < XFS_ERR_CLASS_MAX); + + error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype, + &mp->m_error_kobj, parent_name); + if (error) + return error; + + for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) { + cfg = &mp->m_error_cfg[class][i]; + error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype, + parent_kobj, init[i].name); + if (error) + goto out_error; + + cfg->max_retries = init[i].max_retries; + cfg->retry_timeout = msecs_to_jiffies( + init[i].retry_timeout * MSEC_PER_SEC); + } + return 0; + +out_error: + /* unwind the entries that succeeded */ + for (i--; i >= 0; i--) { + cfg = &mp->m_error_cfg[class][i]; + xfs_sysfs_del(&cfg->kobj); + } + xfs_sysfs_del(parent_kobj); + return error; +} + +int +xfs_error_sysfs_init( + struct xfs_mount *mp) +{ + int error; + + /* .../xfs/<dev>/error/ */ + error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, + &mp->m_kobj, "error"); + if (error) + return error; + + error = sysfs_create_file(&mp->m_error_kobj.kobject, + ATTR_LIST(fail_at_unmount)); + + if (error) + goto out_error; + + /* .../xfs/<dev>/error/metadata/ */ + error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, + "metadata", &mp->m_error_meta_kobj, + xfs_error_meta_init); + if (error) + goto out_error; + + return 0; + +out_error: + xfs_sysfs_del(&mp->m_error_kobj); + return error; +} + +void +xfs_error_sysfs_del( + struct xfs_mount *mp) +{ + struct xfs_error_cfg *cfg; + int i, j; + + for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { + for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { + cfg = &mp->m_error_cfg[i][j]; + + xfs_sysfs_del(&cfg->kobj); + } + } + xfs_sysfs_del(&mp->m_error_meta_kobj); + xfs_sysfs_del(&mp->m_error_kobj); +} + +struct xfs_error_cfg * +xfs_error_get_cfg( + struct xfs_mount *mp, + int error_class, + int error) +{ + struct xfs_error_cfg *cfg; + + switch (error) { + case EIO: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; + break; + case ENOSPC: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC]; + break; + case ENODEV: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV]; + break; + default: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT]; + break; + } + + return cfg; +} diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index be692e59938d..d04637181ef2 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -58,4 +58,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } +int xfs_error_sysfs_init(struct xfs_mount *mp); +void xfs_error_sysfs_del(struct xfs_mount *mp); + #endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c8d58426008e..ea94ee0fe5ea 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -364,7 +364,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); -DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); @@ -944,7 +943,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_ARGS(log, tic), TP_STRUCT__entry( __field(dev_t, dev) - __field(unsigned, trans_type) __field(char, ocnt) __field(char, cnt) __field(int, curr_res) @@ -962,7 +960,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; - __entry->trans_type = tic->t_trans_type; __entry->ocnt = tic->t_ocnt; __entry->cnt = tic->t_cnt; __entry->curr_res = tic->t_curr_res; @@ -980,14 +977,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_block = log->l_curr_block; __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); ), - TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u " "t_unit_res %u t_flags %s reserveq %s " "writeq %s grant_reserve_cycle %d " "grant_reserve_bytes %d grant_write_cycle %d " "grant_write_bytes %d curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), __entry->ocnt, __entry->cnt, __entry->curr_res, @@ -1053,19 +1049,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, ) TRACE_EVENT(xfs_log_force, - TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), - TP_ARGS(mp, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip), + TP_ARGS(mp, lsn, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_lsn_t, lsn) + __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->lsn = lsn; + __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d lsn 0x%llx", + TP_printk("dev %d:%d lsn 0x%llx caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->lsn) + __entry->lsn, (void *)__entry->caller_ip) ) #define DEFINE_LOG_ITEM_EVENT(name) \ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 20c53666cb4b..5f3d33d16e67 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -47,47 +47,6 @@ xfs_trans_init( } /* - * This routine is called to allocate a transaction structure. - * The type parameter indicates the type of the transaction. These - * are enumerated in xfs_trans.h. - * - * Dynamically allocate the transaction structure from the transaction - * zone, initialize it, and return it to the caller. - */ -xfs_trans_t * -xfs_trans_alloc( - xfs_mount_t *mp, - uint type) -{ - xfs_trans_t *tp; - - sb_start_intwrite(mp->m_super); - tp = _xfs_trans_alloc(mp, type, KM_SLEEP); - tp->t_flags |= XFS_TRANS_FREEZE_PROT; - return tp; -} - -xfs_trans_t * -_xfs_trans_alloc( - xfs_mount_t *mp, - uint type, - xfs_km_flags_t memflags) -{ - xfs_trans_t *tp; - - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - atomic_inc(&mp->m_active_trans); - - tp = kmem_zone_zalloc(xfs_trans_zone, memflags); - tp->t_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_type = type; - tp->t_mountp = mp; - INIT_LIST_HEAD(&tp->t_items); - INIT_LIST_HEAD(&tp->t_busy); - return tp; -} - -/* * Free the transaction structure. If there is more clean up * to do when the structure is freed, add it here. */ @@ -99,7 +58,7 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); - if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); @@ -125,7 +84,6 @@ xfs_trans_dup( * Initialize the new transaction structure. */ ntp->t_magic = XFS_TRANS_HEADER_MAGIC; - ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); @@ -135,9 +93,9 @@ xfs_trans_dup( ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE) | - (tp->t_flags & XFS_TRANS_FREEZE_PROT); + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); /* We gave our writer reference to the new transaction */ - tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; + tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; @@ -165,7 +123,7 @@ xfs_trans_dup( * This does not do quota reservations. That typically is done by the * caller afterwards. */ -int +static int xfs_trans_reserve( struct xfs_trans *tp, struct xfs_trans_res *resp, @@ -219,7 +177,7 @@ xfs_trans_reserve( resp->tr_logres, resp->tr_logcount, &tp->t_ticket, XFS_TRANSACTION, - permanent, tp->t_type); + permanent); } if (error) @@ -268,6 +226,42 @@ undo_blocks: return error; } +int +xfs_trans_alloc( + struct xfs_mount *mp, + struct xfs_trans_res *resp, + uint blocks, + uint rtextents, + uint flags, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + int error; + + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) + sb_start_intwrite(mp->m_super); + + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + atomic_inc(&mp->m_active_trans); + + tp = kmem_zone_zalloc(xfs_trans_zone, + (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); + tp->t_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_flags = flags; + tp->t_mountp = mp; + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); + + error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error) { + xfs_trans_cancel(tp); + return error; + } + + *tpp = tp; + return 0; +} + /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e7c49cf43fbc..9a462e892e4f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -90,7 +90,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, */ typedef struct xfs_trans { unsigned int t_magic; /* magic number */ - unsigned int t_type; /* transaction type */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ unsigned int t_blk_res; /* # of blocks resvd */ @@ -148,10 +147,9 @@ typedef struct xfs_trans { /* * XFS transaction mechanism exported interfaces. */ -xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); -xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); -int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, - uint, uint); +int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, + uint blocks, uint rtextents, uint flags, + struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 110f1d7d86b0..ec58ff094b1d 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -32,11 +32,11 @@ static int -xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, void *value, size_t size) +xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, void *value, size_t size) { int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(d_inode(dentry)); + struct xfs_inode *ip = XFS_I(inode); int error, asize = size; /* Convert Linux syscall to XFS internal ATTR flags */ @@ -146,7 +146,7 @@ __xfs_xattr_put_listent( arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ - return 1; + return 0; } offset = (char *)context->alist + context->count; strncpy(offset, prefix, prefix_len); @@ -166,8 +166,7 @@ xfs_xattr_put_listent( int flags, unsigned char *name, int namelen, - int valuelen, - unsigned char *value) + int valuelen) { char *prefix; int prefix_len; @@ -221,11 +220,15 @@ xfs_xattr_put_listent( } ssize_t -xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +xfs_vn_listxattr( + struct dentry *dentry, + char *data, + size_t size) { struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(dentry); + int error; /* * First read the regular on-disk attributes. @@ -239,7 +242,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; - xfs_attr_list_int(&context); + error = xfs_attr_list_int(&context); + if (error) + return error; if (context.count < 0) return -ERANGE; |