summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/aio.c20
-rw-r--r--fs/attr.c23
-rw-r--r--fs/ceph/Makefile2
-rw-r--r--fs/ceph/acl.c4
-rw-r--r--fs/ceph/caps.c3
-rw-r--r--fs/ceph/debugfs.c2
-rw-r--r--fs/ceph/dir.c4
-rw-r--r--fs/ceph/file.c11
-rw-r--r--fs/ceph/inode.c47
-rw-r--r--fs/ceph/mds_client.c175
-rw-r--r--fs/ceph/mds_client.h39
-rw-r--r--fs/ceph/mdsmap.c91
-rw-r--r--fs/ceph/super.c128
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/ceph/util.c100
-rw-r--r--fs/ceph/xattr.c7
-rw-r--r--fs/cifs/cifs_debug.c108
-rw-r--r--fs/cifs/dfs_cache.c13
-rw-r--r--fs/cifs/dfs_cache.h2
-rw-r--r--fs/cifs/inode.c13
-rw-r--r--fs/configfs/inode.c9
-rw-r--r--fs/debugfs/inode.c121
-rw-r--r--fs/eventfd.c15
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/f2fs/file.c18
-rw-r--r--fs/f2fs/node.c2
-rw-r--r--fs/fat/misc.c10
-rw-r--r--fs/fscache/internal.h2
-rw-r--r--fs/fscache/object-list.c11
-rw-r--r--fs/fscache/proc.c2
-rw-r--r--fs/inode.c33
-rw-r--r--fs/io_uring.c254
-rw-r--r--fs/jbd2/journal.c13
-rw-r--r--fs/jfs/jfs_debug.c14
-rw-r--r--fs/jfs/jfs_dmap.c1
-rw-r--r--fs/kernfs/inode.c6
-rw-r--r--fs/libfs.c70
-rw-r--r--fs/lockd/procfs.c12
-rw-r--r--fs/nfsd/nfsctl.c13
-rw-r--r--fs/nfsd/stats.c12
-rw-r--r--fs/ntfs/inode.c18
-rw-r--r--fs/ocfs2/file.c14
-rw-r--r--fs/ocfs2/suballoc.c2
-rw-r--r--fs/overlayfs/copy_up.c43
-rw-r--r--fs/overlayfs/dir.c10
-rw-r--r--fs/overlayfs/export.c28
-rw-r--r--fs/overlayfs/file.c162
-rw-r--r--fs/overlayfs/inode.c66
-rw-r--r--fs/overlayfs/namei.c38
-rw-r--r--fs/overlayfs/overlayfs.h24
-rw-r--r--fs/overlayfs/ovl_entry.h23
-rw-r--r--fs/overlayfs/readdir.c22
-rw-r--r--fs/overlayfs/super.c233
-rw-r--r--fs/overlayfs/util.c28
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/bootconfig.c89
-rw-r--r--fs/proc/cpuinfo.c12
-rw-r--r--fs/proc/generic.c38
-rw-r--r--fs/proc/inode.c76
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/kcore.c13
-rw-r--r--fs/proc/kmsg.c14
-rw-r--r--fs/proc/page.c54
-rw-r--r--fs/proc/proc_net.c32
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/stat.c12
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/proc/vmcore.c10
-rw-r--r--fs/read_write.c56
-rw-r--r--fs/sysfs/group.c2
-rw-r--r--fs/tracefs/inode.c114
-rw-r--r--fs/ubifs/file.c18
-rw-r--r--fs/ubifs/sb.c11
-rw-r--r--fs/utimes.c4
-rw-r--r--fs/xfs/libxfs/xfs_ag.c21
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c51
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c22
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c25
-rw-r--r--fs/xfs/libxfs/xfs_btree.c45
-rw-r--r--fs/xfs/libxfs/xfs_btree.h21
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c8
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c12
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c6
-rw-r--r--fs/xfs/libxfs/xfs_sb.c17
-rw-r--r--fs/xfs/scrub/agheader_repair.c4
-rw-r--r--fs/xfs/scrub/fscounters.c3
-rw-r--r--fs/xfs/scrub/repair.c10
-rw-r--r--fs/xfs/xfs_attr_inactive.c17
-rw-r--r--fs/xfs/xfs_buf.c161
-rw-r--r--fs/xfs/xfs_buf.h33
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_discard.c2
-rw-r--r--fs/xfs/xfs_dquot.c8
-rw-r--r--fs/xfs/xfs_filestream.c11
-rw-r--r--fs/xfs/xfs_inode.c12
-rw-r--r--fs/xfs/xfs_log_recover.c30
-rw-r--r--fs/xfs/xfs_reflink.c2
-rw-r--r--fs/xfs/xfs_rtalloc.c8
-rw-r--r--fs/xfs/xfs_symlink.c37
-rw-r--r--fs/xfs/xfs_trans.h14
-rw-r--r--fs/xfs/xfs_trans_buf.c61
102 files changed, 1963 insertions, 1374 deletions
diff --git a/fs/aio.c b/fs/aio.c
index a9fbad2ce5e6..5f3d3d814928 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1610,6 +1610,14 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
return 0;
}
+static void aio_poll_put_work(struct work_struct *work)
+{
+ struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+
+ iocb_put(iocb);
+}
+
static void aio_poll_complete_work(struct work_struct *work)
{
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1674,6 +1682,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
list_del_init(&req->wait.entry);
if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+ struct kioctx *ctx = iocb->ki_ctx;
+
/*
* Try to complete the iocb inline if we can. Use
* irqsave/irqrestore because not all filesystems (e.g. fuse)
@@ -1683,8 +1693,14 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
- iocb_put(iocb);
+ if (iocb->ki_eventfd && eventfd_signal_count()) {
+ iocb = NULL;
+ INIT_WORK(&req->work, aio_poll_put_work);
+ schedule_work(&req->work);
+ }
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ if (iocb)
+ iocb_put(iocb);
} else {
schedule_work(&req->work);
}
diff --git a/fs/attr.c b/fs/attr.c
index df28035aa23e..b4bbdbd4c8ca 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -183,18 +183,12 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
inode->i_uid = attr->ia_uid;
if (ia_valid & ATTR_GID)
inode->i_gid = attr->ia_gid;
- if (ia_valid & ATTR_ATIME) {
- inode->i_atime = timestamp_truncate(attr->ia_atime,
- inode);
- }
- if (ia_valid & ATTR_MTIME) {
- inode->i_mtime = timestamp_truncate(attr->ia_mtime,
- inode);
- }
- if (ia_valid & ATTR_CTIME) {
- inode->i_ctime = timestamp_truncate(attr->ia_ctime,
- inode);
- }
+ if (ia_valid & ATTR_ATIME)
+ inode->i_atime = attr->ia_atime;
+ if (ia_valid & ATTR_MTIME)
+ inode->i_mtime = attr->ia_mtime;
+ if (ia_valid & ATTR_CTIME)
+ inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
@@ -268,8 +262,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
attr->ia_ctime = now;
if (!(ia_valid & ATTR_ATIME_SET))
attr->ia_atime = now;
+ else
+ attr->ia_atime = timestamp_truncate(attr->ia_atime, inode);
if (!(ia_valid & ATTR_MTIME_SET))
attr->ia_mtime = now;
+ else
+ attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode);
+
if (ia_valid & ATTR_KILL_PRIV) {
error = security_inode_need_killpriv(dentry);
if (error < 0)
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index c1da294418d1..0a0823d378db 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o quota.o io.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
- debugfs.o
+ debugfs.o util.o
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index aa55f412a6e3..26be6520d3fb 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -222,8 +222,8 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
if (err)
goto out_err;
- err = ceph_pagelist_encode_string(pagelist,
- XATTR_NAME_POSIX_ACL_DEFAULT, len);
+ ceph_pagelist_encode_string(pagelist,
+ XATTR_NAME_POSIX_ACL_DEFAULT, len);
err = posix_acl_to_xattr(&init_user_ns, default_acl,
tmp_buf, val_size2);
if (err < 0)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9d09bb53c1ab..28ae0c134700 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -908,7 +908,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
ci_node);
if (!__cap_is_valid(cap))
continue;
- __touch_cap(cap);
+ if (cap->issued & mask)
+ __touch_cap(cap);
}
}
return 1;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index c281f32b54f7..fb7cabd98e7b 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -33,7 +33,7 @@ static int mdsmap_show(struct seq_file *s, void *p)
seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
- for (i = 0; i < mdsmap->m_num_mds; i++) {
+ for (i = 0; i < mdsmap->possible_max_rank; i++) {
struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
int state = mdsmap->m_info[i].state;
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2e4764fd1872..d0cd0aba5843 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1186,7 +1186,7 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
struct dentry *dn = di->dentry;
struct ceph_mds_client *mdsc;
- dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n",
+ dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n",
di, dn, dn, di->offset);
if (!list_empty(&di->lease_list)) {
@@ -1567,7 +1567,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
inode = d_inode(dentry);
}
- dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
+ dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
dentry, inode, ceph_dentry(dentry)->offset);
/* always trust cached snapped dentries, snapdir dentry */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 11929d2bb594..c3b8e8e0bf17 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1974,6 +1974,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
return -EOPNOTSUPP;
+ if (!src_fsc->have_copy_from2)
+ return -EOPNOTSUPP;
+
/*
* Striped file layouts require that we copy partial objects, but the
* OSD copy-from operation only supports full-object copies. Limit
@@ -2101,8 +2104,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
&dst_oid, &dst_oloc,
CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0);
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
+ dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
if (err) {
+ if (err == -EOPNOTSUPP) {
+ src_fsc->have_copy_from2 = false;
+ pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+ }
dout("ceph_osdc_copy_from returned %d\n", err);
if (!ret)
ret = err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index c07407586ce8..d01710a16a4a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -55,11 +55,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode->i_state & I_NEW)
dout("get_inode created new inode %p %llx.%llx ino %llx\n",
inode, ceph_vinop(inode), (u64)inode->i_ino);
- unlock_new_inode(inode);
- }
dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
vino.snap, inode);
@@ -88,6 +86,10 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
+
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
+
return inode;
}
@@ -728,8 +730,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
static int fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_mds_reply_info_in *iinfo,
struct ceph_mds_reply_dirfrag *dirinfo,
- struct ceph_mds_session *session,
- unsigned long ttl_from, int cap_fmode,
+ struct ceph_mds_session *session, int cap_fmode,
struct ceph_cap_reservation *caps_reservation)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
@@ -754,8 +755,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
info_caps = le32_to_cpu(info->cap.caps);
/* prealloc new cap struct */
- if (info_caps && ceph_snap(inode) == CEPH_NOSNAP)
+ if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
new_cap = ceph_get_cap(mdsc, caps_reservation);
+ if (!new_cap)
+ return -ENOMEM;
+ }
/*
* prealloc xattr data, if it looks like we'll need it. only
@@ -1237,7 +1241,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
if (dir) {
err = fill_inode(dir, NULL,
&rinfo->diri, rinfo->dirfrag,
- session, req->r_request_started, -1,
+ session, -1,
&req->r_caps_reservation);
if (err < 0)
goto done;
@@ -1302,18 +1306,22 @@ retry_lookup:
err = PTR_ERR(in);
goto done;
}
- req->r_target_inode = in;
err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
- session, req->r_request_started,
+ session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
- rinfo->head->result == 0) ? req->r_fmode : -1,
+ rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
pr_err("fill_inode badness %p %llx.%llx\n",
in, ceph_vinop(in));
+ if (in->i_state & I_NEW)
+ discard_new_inode(in);
goto done;
}
+ req->r_target_inode = in;
+ if (in->i_state & I_NEW)
+ unlock_new_inode(in);
}
/*
@@ -1493,12 +1501,18 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
continue;
}
rc = fill_inode(in, NULL, &rde->inode, NULL, session,
- req->r_request_started, -1,
- &req->r_caps_reservation);
+ -1, &req->r_caps_reservation);
if (rc < 0) {
pr_err("fill_inode badness on %p got %d\n", in, rc);
err = rc;
+ if (in->i_state & I_NEW) {
+ ihold(in);
+ discard_new_inode(in);
+ }
+ } else if (in->i_state & I_NEW) {
+ unlock_new_inode(in);
}
+
/* avoid calling iput_final() in mds dispatch threads */
ceph_async_iput(in);
}
@@ -1694,19 +1708,24 @@ retry_lookup:
}
ret = fill_inode(in, NULL, &rde->inode, NULL, session,
- req->r_request_started, -1,
- &req->r_caps_reservation);
+ -1, &req->r_caps_reservation);
if (ret < 0) {
pr_err("fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) {
/* avoid calling iput_final() in mds
* dispatch threads */
+ if (in->i_state & I_NEW) {
+ ihold(in);
+ discard_new_inode(in);
+ }
ceph_async_iput(in);
}
d_drop(dn);
err = ret;
goto next_item;
}
+ if (in->i_state & I_NEW)
+ unlock_new_inode(in);
if (d_really_is_negative(dn)) {
if (ceph_security_xattr_deadlock(in)) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 145d46ba25ae..bbbbddf71326 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -9,6 +9,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
+#include <linux/bits.h>
#include "super.h"
#include "mds_client.h"
@@ -530,6 +531,7 @@ const char *ceph_session_state_name(int s)
case CEPH_MDS_SESSION_OPEN: return "open";
case CEPH_MDS_SESSION_HUNG: return "hung";
case CEPH_MDS_SESSION_CLOSING: return "closing";
+ case CEPH_MDS_SESSION_CLOSED: return "closed";
case CEPH_MDS_SESSION_RESTARTING: return "restarting";
case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
case CEPH_MDS_SESSION_REJECTED: return "rejected";
@@ -537,7 +539,7 @@ const char *ceph_session_state_name(int s)
}
}
-static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
{
if (refcount_inc_not_zero(&s->s_ref)) {
dout("mdsc get_session %p %d -> %d\n", s,
@@ -568,7 +570,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
{
if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
return NULL;
- return get_session(mdsc->sessions[mds]);
+ return ceph_get_mds_session(mdsc->sessions[mds]);
}
static bool __have_session(struct ceph_mds_client *mdsc, int mds)
@@ -597,7 +599,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *s;
- if (mds >= mdsc->mdsmap->m_num_mds)
+ if (mds >= mdsc->mdsmap->possible_max_rank)
return ERR_PTR(-EINVAL);
s = kzalloc(sizeof(*s), GFP_NOFS);
@@ -674,7 +676,6 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
dout("__unregister_session mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
- s->s_state = 0;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
atomic_dec(&mdsc->num_sessions);
@@ -878,7 +879,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry)
* Called under mdsc->mutex.
*/
static int __choose_mds(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
+ struct ceph_mds_request *req,
+ bool *random)
{
struct inode *inode;
struct ceph_inode_info *ci;
@@ -888,6 +890,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
u32 hash = req->r_direct_hash;
bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ if (random)
+ *random = false;
+
/*
* is there a specific mds we should try? ignore hint if we have
* no session and the mds is not up (active or recovering).
@@ -895,7 +900,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (req->r_resend_mds >= 0 &&
(__have_session(mdsc, req->r_resend_mds) ||
ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
- dout("choose_mds using resend_mds mds%d\n",
+ dout("%s using resend_mds mds%d\n", __func__,
req->r_resend_mds);
return req->r_resend_mds;
}
@@ -913,7 +918,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_lock();
inode = get_nonsnap_parent(req->r_dentry);
rcu_read_unlock();
- dout("__choose_mds using snapdir's parent %p\n", inode);
+ dout("%s using snapdir's parent %p\n", __func__, inode);
}
} else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
@@ -933,7 +938,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
/* direct snapped/virtual snapdir requests
* based on parent dir inode */
inode = get_nonsnap_parent(parent);
- dout("__choose_mds using nonsnap parent %p\n", inode);
+ dout("%s using nonsnap parent %p\n", __func__, inode);
} else {
/* dentry target */
inode = d_inode(req->r_dentry);
@@ -949,8 +954,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_unlock();
}
- dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
- (int)hash, mode);
+ dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
+ hash, mode);
if (!inode)
goto random;
ci = ceph_inode(inode);
@@ -968,30 +973,33 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
get_random_bytes(&r, 1);
r %= frag.ndist;
mds = frag.dist[r];
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (%d/%d)\n",
- inode, ceph_vinop(inode),
- frag.frag, mds,
- (int)r, frag.ndist);
+ dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
+ __func__, inode, ceph_vinop(inode),
+ frag.frag, mds, (int)r, frag.ndist);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
- CEPH_MDS_STATE_ACTIVE)
+ CEPH_MDS_STATE_ACTIVE &&
+ !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
goto out;
}
/* since this file/dir wasn't known to be
* replicated, then we want to look for the
* authoritative mds. */
- mode = USE_AUTH_MDS;
if (frag.mds >= 0) {
/* choose auth mds */
mds = frag.mds;
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (auth)\n",
- inode, ceph_vinop(inode), frag.frag, mds);
+ dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
+ __func__, inode, ceph_vinop(inode),
+ frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
- CEPH_MDS_STATE_ACTIVE)
- goto out;
+ CEPH_MDS_STATE_ACTIVE) {
+ if (mode == USE_ANY_MDS &&
+ !ceph_mdsmap_is_laggy(mdsc->mdsmap,
+ mds))
+ goto out;
+ }
}
+ mode = USE_AUTH_MDS;
}
}
@@ -1007,7 +1015,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
goto random;
}
mds = cap->session->s_mds;
- dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+ dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
inode, ceph_vinop(inode), mds,
cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock);
@@ -1018,8 +1026,11 @@ out:
return mds;
random:
+ if (random)
+ *random = true;
+
mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
- dout("choose_mds chose random mds%d\n", mds);
+ dout("%s chose random mds%d\n", __func__, mds);
return mds;
}
@@ -1045,20 +1056,21 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
return msg;
}
+static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
+#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
static void encode_supported_features(void **p, void *end)
{
- static const unsigned char bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
- static const size_t count = ARRAY_SIZE(bits);
+ static const size_t count = ARRAY_SIZE(feature_bits);
if (count > 0) {
size_t i;
- size_t size = ((size_t)bits[count - 1] + 64) / 64 * 8;
+ size_t size = FEATURE_BYTES(count);
BUG_ON(*p + 4 + size > end);
ceph_encode_32(p, size);
memset(*p, 0, size);
for (i = 0; i < count; i++)
- ((unsigned char*)(*p))[i / 8] |= 1 << (bits[i] % 8);
+ ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
*p += size;
} else {
BUG_ON(*p + 4 > end);
@@ -1079,6 +1091,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
int metadata_key_count = 0;
struct ceph_options *opt = mdsc->fsc->client->options;
struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
+ size_t size, count;
void *p, *end;
const char* metadata[][2] = {
@@ -1096,8 +1109,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
strlen(metadata[i][1]);
metadata_key_count++;
}
+
/* supported feature */
- extra_bytes += 4 + 8;
+ size = 0;
+ count = ARRAY_SIZE(feature_bits);
+ if (count > 0)
+ size = FEATURE_BYTES(count);
+ extra_bytes += 4 + size;
/* Allocate the message */
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
@@ -1117,7 +1135,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
*
- * ClientSession messages with metadata are v2
+ * ClientSession messages with metadata are v3
*/
msg->hdr.version = cpu_to_le16(3);
msg->hdr.compat_version = cpu_to_le16(1);
@@ -1219,7 +1237,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
- if (mds >= mdsc->mdsmap->m_num_mds)
+ if (mds >= mdsc->mdsmap->possible_max_rank)
return;
mi = &mdsc->mdsmap->m_info[mds];
@@ -1967,7 +1985,7 @@ void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
if (mdsc->stopping)
return;
- get_session(session);
+ ceph_get_mds_session(session);
if (queue_work(mdsc->fsc->cap_wq,
&session->s_cap_release_work)) {
dout("cap release work queued\n");
@@ -2072,7 +2090,6 @@ struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
- struct timespec64 ts;
if (!req)
return ERR_PTR(-ENOMEM);
@@ -2091,8 +2108,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
- ktime_get_coarse_real_ts64(&ts);
- req->r_stamp = timespec64_trunc(ts, mdsc->fsc->sb->s_time_gran);
+ ktime_get_coarse_real_ts64(&req->r_stamp);
req->r_op = op;
req->r_direct_mode = mode;
@@ -2518,6 +2534,26 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
}
/*
+ * called under mdsc->mutex
+ */
+static int __send_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_mds_request *req,
+ bool drop_cap_releases)
+{
+ int err;
+
+ err = __prepare_send_request(mdsc, req, session->s_mds,
+ drop_cap_releases);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+
+ return err;
+}
+
+/*
* send request, or put it on the appropriate wait list.
*/
static void __do_request(struct ceph_mds_client *mdsc,
@@ -2526,6 +2562,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session = NULL;
int mds = -1;
int err = 0;
+ bool random;
if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
@@ -2558,15 +2595,14 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (!(mdsc->fsc->mount_options->flags &
CEPH_MOUNT_OPT_MOUNTWAIT) &&
!ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
- err = -ENOENT;
- pr_info("probably no mds server is up\n");
+ err = -EHOSTUNREACH;
goto finish;
}
}
put_request_session(req);
- mds = __choose_mds(mdsc, req);
+ mds = __choose_mds(mdsc, req, &random);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
dout("do_request no mds or not active, waiting for map\n");
@@ -2583,7 +2619,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
goto finish;
}
}
- req->r_session = get_session(session);
+ req->r_session = ceph_get_mds_session(session);
dout("do_request mds%d session %p state %s\n", mds, session,
ceph_session_state_name(session->s_state));
@@ -2594,8 +2630,12 @@ static void __do_request(struct ceph_mds_client *mdsc,
goto out_session;
}
if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
+ session->s_state == CEPH_MDS_SESSION_CLOSING) {
__open_session(mdsc, session);
+ /* retry the same mds later */
+ if (random)
+ req->r_resend_mds = mds;
+ }
list_add(&req->r_wait, &session->s_waiting);
goto out_session;
}
@@ -2606,11 +2646,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_request_started == 0) /* note request start time */
req->r_request_started = jiffies;
- err = __prepare_send_request(mdsc, req, mds, false);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
- }
+ err = __send_request(mdsc, session, req, false);
out_session:
ceph_put_mds_session(session);
@@ -2863,7 +2899,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&mdsc->mutex);
goto out;
} else {
- int mds = __choose_mds(mdsc, req);
+ int mds = __choose_mds(mdsc, req, NULL);
if (mds >= 0 && mds != req->r_session->s_mds) {
dout("but auth changed, so resending\n");
__do_request(mdsc, req);
@@ -2879,6 +2915,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
+ /* last request during umount? */
+ if (mdsc->stopping && !__get_oldest_req(mdsc))
+ complete_all(&mdsc->safe_umount_waiters);
+
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
/*
* We already handled the unsafe response, now do the
@@ -2889,9 +2929,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
*/
dout("got safe reply %llu, mds%d\n", tid, mds);
- /* last unsafe request during umount? */
- if (mdsc->stopping && !__get_oldest_req(mdsc))
- complete_all(&mdsc->safe_umount_waiters);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -3106,7 +3143,7 @@ static void handle_session(struct ceph_mds_session *session,
mutex_lock(&mdsc->mutex);
if (op == CEPH_SESSION_CLOSE) {
- get_session(session);
+ ceph_get_mds_session(session);
__unregister_session(mdsc, session);
}
/* FIXME: this ttl calculation is generous */
@@ -3144,6 +3181,7 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_CLOSE:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect denied\n", session->s_mds);
+ session->s_state = CEPH_MDS_SESSION_CLOSED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
wake = 2; /* for good measure */
@@ -3211,7 +3249,6 @@ bad:
return;
}
-
/*
* called under session->mutex.
*/
@@ -3220,18 +3257,12 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
{
struct ceph_mds_request *req, *nreq;
struct rb_node *p;
- int err;
dout("replay_unsafe_requests mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
- list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
- err = __prepare_send_request(mdsc, req, session->s_mds, true);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
- }
- }
+ list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
+ __send_request(mdsc, session, req, true);
/*
* also re-send old requests when MDS enters reconnect stage. So that MDS
@@ -3246,14 +3277,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
if (req->r_attempts == 0)
continue; /* only old requests */
if (req->r_session &&
- req->r_session->s_mds == session->s_mds) {
- err = __prepare_send_request(mdsc, req,
- session->s_mds, true);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
- }
- }
+ req->r_session->s_mds == session->s_mds)
+ __send_request(mdsc, session, req, true);
}
mutex_unlock(&mdsc->mutex);
}
@@ -3764,7 +3789,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
dout("check_new_map new %u old %u\n",
newmap->m_epoch, oldmap->m_epoch);
- for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
if (!mdsc->sessions[i])
continue;
s = mdsc->sessions[i];
@@ -3778,9 +3803,9 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
ceph_session_state_name(s->s_state));
- if (i >= newmap->m_num_mds) {
+ if (i >= newmap->possible_max_rank) {
/* force close session for stopped mds */
- get_session(s);
+ ceph_get_mds_session(s);
__unregister_session(mdsc, s);
__wake_requests(mdsc, &s->s_waiting);
mutex_unlock(&mdsc->mutex);
@@ -3835,7 +3860,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
}
}
- for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
s = mdsc->sessions[i];
if (!s)
continue;
@@ -4381,7 +4406,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {
if (mdsc->sessions[i]) {
- session = get_session(mdsc->sessions[i]);
+ session = ceph_get_mds_session(mdsc->sessions[i]);
__unregister_session(mdsc, session);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -4609,11 +4634,8 @@ static struct ceph_connection *con_get(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
- if (get_session(s)) {
- dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref));
+ if (ceph_get_mds_session(s))
return con;
- }
- dout("mdsc con_get %p FAIL\n", s);
return NULL;
}
@@ -4621,7 +4643,6 @@ static void con_put(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
- dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1);
ceph_put_mds_session(s);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 14c7e8c49970..27a7446e10d3 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -17,22 +17,31 @@
#include <linux/ceph/auth.h>
/* The first 8 bits are reserved for old ceph releases */
-#define CEPHFS_FEATURE_MIMIC 8
-#define CEPHFS_FEATURE_REPLY_ENCODING 9
-#define CEPHFS_FEATURE_RECLAIM_CLIENT 10
-#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11
-#define CEPHFS_FEATURE_MULTI_RECONNECT 12
+enum ceph_feature_type {
+ CEPHFS_FEATURE_MIMIC = 8,
+ CEPHFS_FEATURE_REPLY_ENCODING,
+ CEPHFS_FEATURE_RECLAIM_CLIENT,
+ CEPHFS_FEATURE_LAZY_CAP_WANTED,
+ CEPHFS_FEATURE_MULTI_RECONNECT,
+
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
+};
-#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
+/*
+ * This will always have the highest feature bit value
+ * as the last element of the array.
+ */
+#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
+ \
+ CEPHFS_FEATURE_MAX, \
}
#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
-
/*
* Some lock dependencies:
*
@@ -151,7 +160,8 @@ enum {
CEPH_MDS_SESSION_RESTARTING = 5,
CEPH_MDS_SESSION_RECONNECTING = 6,
CEPH_MDS_SESSION_CLOSING = 7,
- CEPH_MDS_SESSION_REJECTED = 8,
+ CEPH_MDS_SESSION_CLOSED = 8,
+ CEPH_MDS_SESSION_REJECTED = 9,
};
struct ceph_mds_session {
@@ -174,6 +184,7 @@ struct ceph_mds_session {
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
+ refcount_t s_ref;
struct list_head s_caps; /* all caps issued by this session */
struct ceph_cap *s_cap_iterator;
int s_nr_caps;
@@ -188,7 +199,6 @@ struct ceph_mds_session {
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- refcount_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
@@ -224,6 +234,7 @@ struct ceph_mds_request {
struct rb_node r_node;
struct ceph_mds_client *r_mdsc;
+ struct kref r_kref;
int r_op; /* mds op code */
/* operation on what? */
@@ -294,7 +305,6 @@ struct ceph_mds_request {
int r_resend_mds; /* mds to resend to next, if any*/
u32 r_sent_on_mseq; /* cap mseq request was sent at*/
- struct kref r_kref;
struct list_head r_wait;
struct completion r_completion;
struct completion r_safe_completion;
@@ -451,15 +461,10 @@ extern const char *ceph_mds_op_name(int op);
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
-static inline struct ceph_mds_session *
-ceph_get_mds_session(struct ceph_mds_session *s)
-{
- refcount_inc(&s->s_ref);
- return s;
-}
-
extern const char *ceph_session_state_name(int s);
+extern struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s);
extern void ceph_put_mds_session(struct ceph_mds_session *s);
extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 471bac335fae..889627817e52 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -13,30 +13,25 @@
#include "super.h"
+#define CEPH_MDS_IS_READY(i, ignore_laggy) \
+ (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy)
-/*
- * choose a random mds that is "up" (i.e. has a state > 0), or -1.
- */
-int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
{
int n = 0;
int i, j;
- /* special case for one mds */
- if (1 == m->m_num_mds && m->m_info[0].state > 0)
- return 0;
-
/* count */
- for (i = 0; i < m->m_num_mds; i++)
- if (m->m_info[i].state > 0)
+ for (i = 0; i < m->possible_max_rank; i++)
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
n++;
if (n == 0)
return -1;
/* pick */
n = prandom_u32() % n;
- for (j = 0, i = 0; i < m->m_num_mds; i++) {
- if (m->m_info[i].state > 0)
+ for (j = 0, i = 0; i < m->possible_max_rank; i++) {
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
j++;
if (j > n)
break;
@@ -45,6 +40,20 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
return i;
}
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+ int mds;
+
+ mds = __mdsmap_get_random_mds(m, false);
+ if (mds == m->possible_max_rank || mds == -1)
+ mds = __mdsmap_get_random_mds(m, true);
+
+ return mds == m->possible_max_rank ? -1 : mds;
+}
+
#define __decode_and_drop_type(p, end, type, bad) \
do { \
if (*p + sizeof(type) > end) \
@@ -138,14 +147,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_session_autoclose = ceph_decode_32(p);
m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p);
- m->m_num_mds = m->m_max_mds;
- m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
+ /*
+ * pick out the active nodes as the m_num_active_mds, the
+ * m_num_active_mds maybe larger than m_max_mds when decreasing
+ * the max_mds in cluster side, in other case it should less
+ * than or equal to m_max_mds.
+ */
+ m->m_num_active_mds = n = ceph_decode_32(p);
+
+ /*
+ * the possible max rank, it maybe larger than the m_num_active_mds,
+ * for example if the mds_max == 2 in the cluster, when the MDS(0)
+ * was laggy and being replaced by a new MDS, we will temporarily
+ * receive a new mds map with n_num_mds == 1 and the active MDS(1),
+ * and the mds rank >= m_num_active_mds.
+ */
+ m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds);
+
+ m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS);
if (!m->m_info)
goto nomem;
/* pick out active nodes from mds_info (state > 0) */
- n = ceph_decode_32(p);
for (i = 0; i < n; i++) {
u64 global_id;
u32 namelen;
@@ -215,18 +239,15 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_mds_state_name(state),
laggy ? "(laggy)" : "");
- if (mds < 0 || state <= 0)
+ if (mds < 0 || mds >= m->possible_max_rank) {
+ pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
continue;
+ }
- if (mds >= m->m_num_mds) {
- int new_num = max(mds + 1, m->m_num_mds * 2);
- void *new_m_info = krealloc(m->m_info,
- new_num * sizeof(*m->m_info),
- GFP_NOFS | __GFP_ZERO);
- if (!new_m_info)
- goto nomem;
- m->m_info = new_m_info;
- m->m_num_mds = new_num;
+ if (state <= 0) {
+ pr_warn("mdsmap_decode got incorrect state(%s)\n",
+ ceph_mds_state_name(state));
+ continue;
}
info = &m->m_info[mds];
@@ -247,14 +268,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info->export_targets = NULL;
}
}
- if (m->m_num_mds > m->m_max_mds) {
- /* find max up mds */
- for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
- if (i == 0 || m->m_info[i-1].state > 0)
- break;
- }
- m->m_num_mds = i;
- }
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
@@ -296,14 +309,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
for (i = 0; i < n; i++) {
s32 mds = ceph_decode_32(p);
- if (mds >= 0 && mds < m->m_num_mds) {
+ if (mds >= 0 && mds < m->possible_max_rank) {
if (m->m_info[mds].laggy)
num_laggy++;
}
}
m->m_num_laggy = num_laggy;
- if (n > m->m_num_mds) {
+ if (n > m->possible_max_rank) {
void *new_m_info = krealloc(m->m_info,
n * sizeof(*m->m_info),
GFP_NOFS | __GFP_ZERO);
@@ -311,7 +324,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
goto nomem;
m->m_info = new_m_info;
}
- m->m_num_mds = n;
+ m->possible_max_rank = n;
}
/* inc */
@@ -382,7 +395,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->m_num_mds; i++)
+ for (i = 0; i < m->possible_max_rank; i++)
kfree(m->m_info[i].export_targets);
kfree(m->m_info);
kfree(m->m_data_pg_pools);
@@ -396,9 +409,9 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
return false;
if (m->m_damaged)
return false;
- if (m->m_num_laggy > 0)
+ if (m->m_num_laggy == m->m_num_active_mds)
return false;
- for (i = 0; i < m->m_num_mds; i++) {
+ for (i = 0; i < m->possible_max_rank; i++) {
if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
nr_active++;
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 29a795f975df..bfb8aead0555 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -107,7 +107,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-
static int ceph_sync_fs(struct super_block *sb, int wait)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
@@ -211,7 +210,6 @@ struct ceph_parse_opts_ctx {
/*
* Parse the source parameter. Distinguish the server list from the path.
- * Internally we do not include the leading '/' in the path.
*
* The source will look like:
* <server_spec>[,<server_spec>...]:[<path>]
@@ -232,12 +230,15 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
dev_name_end = strchr(dev_name, '/');
if (dev_name_end) {
- if (strlen(dev_name_end) > 1) {
- kfree(fsopt->server_path);
- fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
- if (!fsopt->server_path)
- return -ENOMEM;
- }
+ kfree(fsopt->server_path);
+
+ /*
+ * The server_path will include the whole chars from userland
+ * including the leading '/'.
+ */
+ fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+ if (!fsopt->server_path)
+ return -ENOMEM;
} else {
dev_name_end = dev_name + strlen(dev_name);
}
@@ -461,6 +462,73 @@ static int strcmp_null(const char *s1, const char *s2)
return strcmp(s1, s2);
}
+/**
+ * path_remove_extra_slash - Remove the extra slashes in the server path
+ * @server_path: the server path and could be NULL
+ *
+ * Return NULL if the path is NULL or only consists of "/", or a string
+ * without any extra slashes including the leading slash(es) and the
+ * slash(es) at the end of the server path, such as:
+ * "//dir1////dir2///" --> "dir1/dir2"
+ */
+static char *path_remove_extra_slash(const char *server_path)
+{
+ const char *path = server_path;
+ const char *cur, *end;
+ char *buf, *p;
+ int len;
+
+ /* if the server path is omitted */
+ if (!path)
+ return NULL;
+
+ /* remove all the leading slashes */
+ while (*path == '/')
+ path++;
+
+ /* if the server path only consists of slashes */
+ if (*path == '\0')
+ return NULL;
+
+ len = strlen(path);
+
+ buf = kmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ end = path + len;
+ p = buf;
+ do {
+ cur = strchr(path, '/');
+ if (!cur)
+ cur = end;
+
+ len = cur - path;
+
+ /* including one '/' */
+ if (cur != end)
+ len += 1;
+
+ memcpy(p, path, len);
+ p += len;
+
+ while (cur <= end && *cur == '/')
+ cur++;
+ path = cur;
+ } while (path < end);
+
+ *p = '\0';
+
+ /*
+ * remove the last slash if there has and just to make sure that
+ * we will get something like "dir1/dir2"
+ */
+ if (*(--p) == '/')
+ *p = '\0';
+
+ return buf;
+}
+
static int compare_mount_options(struct ceph_mount_options *new_fsopt,
struct ceph_options *new_opt,
struct ceph_fs_client *fsc)
@@ -468,6 +536,7 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
struct ceph_mount_options *fsopt1 = new_fsopt;
struct ceph_mount_options *fsopt2 = fsc->mount_options;
int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+ char *p1, *p2;
int ret;
ret = memcmp(fsopt1, fsopt2, ofs);
@@ -480,9 +549,21 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
if (ret)
return ret;
- ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
+
+ p1 = path_remove_extra_slash(fsopt1->server_path);
+ if (IS_ERR(p1))
+ return PTR_ERR(p1);
+ p2 = path_remove_extra_slash(fsopt2->server_path);
+ if (IS_ERR(p2)) {
+ kfree(p1);
+ return PTR_ERR(p2);
+ }
+ ret = strcmp_null(p1, p2);
+ kfree(p1);
+ kfree(p2);
if (ret)
return ret;
+
ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
if (ret)
return ret;
@@ -637,6 +718,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->sb = NULL;
fsc->mount_state = CEPH_MOUNT_MOUNTING;
fsc->filp_gen = 1;
+ fsc->have_copy_from2 = true;
atomic_long_set(&fsc->writeback_count, 0);
@@ -788,7 +870,6 @@ static void destroy_caches(void)
ceph_fscache_unregister();
}
-
/*
* ceph_umount_begin - initiate forced umount. Tear down down the
* mount, skipping steps that may hang while waiting for server(s).
@@ -868,9 +949,6 @@ out:
return root;
}
-
-
-
/*
* mount: join the ceph cluster, and open root directory.
*/
@@ -885,7 +963,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
mutex_lock(&fsc->client->mount_mutex);
if (!fsc->sb->s_root) {
- const char *path;
+ const char *path, *p;
err = __ceph_open_session(fsc->client, started);
if (err < 0)
goto out;
@@ -897,17 +975,22 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
goto out;
}
- if (!fsc->mount_options->server_path) {
- path = "";
- dout("mount opening path \\t\n");
- } else {
- path = fsc->mount_options->server_path + 1;
- dout("mount opening path %s\n", path);
+ p = path_remove_extra_slash(fsc->mount_options->server_path);
+ if (IS_ERR(p)) {
+ err = PTR_ERR(p);
+ goto out;
}
+ /* if the server path is omitted or just consists of '/' */
+ if (!p)
+ path = "";
+ else
+ path = p;
+ dout("mount opening path '%s'\n", path);
ceph_fs_debugfs_init(fsc);
root = open_root_dentry(fsc, path, started);
+ kfree(p);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto out;
@@ -1070,6 +1153,11 @@ static int ceph_get_tree(struct fs_context *fc)
return 0;
out_splat:
+ if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
+ pr_info("No mds server is up or the cluster is laggy\n");
+ err = -EHOSTUNREACH;
+ }
+
ceph_mdsc_close_sessions(fsc->mdsc);
deactivate_locked_super(sb);
goto out_final;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3bf1a01cd536..1e456a9011bb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -106,6 +106,8 @@ struct ceph_fs_client {
unsigned long last_auto_reconnect;
bool blacklisted;
+ bool have_copy_from2;
+
u32 filp_gen;
loff_t max_file_size;
diff --git a/fs/ceph/util.c b/fs/ceph/util.c
new file mode 100644
index 000000000000..2c34875675bf
--- /dev/null
+++ b/fs/ceph/util.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some non-inline ceph helpers
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+ __u32 su = layout->stripe_unit;
+ __u32 sc = layout->stripe_count;
+ __u32 os = layout->object_size;
+
+ /* stripe unit, object size must be non-zero, 64k increment */
+ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ /* object size must be a multiple of stripe unit */
+ if (os < su || os % su)
+ return 0;
+ /* stripe count must be non-zero */
+ if (!sc)
+ return 0;
+ return 1;
+}
+
+void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
+ fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
+ fl->object_size = le32_to_cpu(legacy->fl_object_size);
+ fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
+ if (fl->pool_id == 0 && fl->stripe_unit == 0 &&
+ fl->stripe_count == 0 && fl->object_size == 0)
+ fl->pool_id = -1;
+}
+
+void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
+ legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
+ legacy->fl_object_size = cpu_to_le32(fl->object_size);
+ if (fl->pool_id >= 0)
+ legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
+ else
+ legacy->fl_pg_pool = 0;
+}
+
+int ceph_flags_to_mode(int flags)
+{
+ int mode;
+
+#ifdef O_DIRECTORY /* fixme */
+ if ((flags & O_DIRECTORY) == O_DIRECTORY)
+ return CEPH_FILE_MODE_PIN;
+#endif
+
+ switch (flags & O_ACCMODE) {
+ case O_WRONLY:
+ mode = CEPH_FILE_MODE_WR;
+ break;
+ case O_RDONLY:
+ mode = CEPH_FILE_MODE_RD;
+ break;
+ case O_RDWR:
+ case O_ACCMODE: /* this is what the VFS does */
+ mode = CEPH_FILE_MODE_RDWR;
+ break;
+ }
+#ifdef O_LAZY
+ if (flags & O_LAZY)
+ mode |= CEPH_FILE_MODE_LAZY;
+#endif
+
+ return mode;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+ int caps = CEPH_CAP_PIN;
+
+ if (mode & CEPH_FILE_MODE_RD)
+ caps |= CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+ if (mode & CEPH_FILE_MODE_WR)
+ caps |= CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+ if (mode & CEPH_FILE_MODE_LAZY)
+ caps |= CEPH_CAP_FILE_LAZYIO;
+
+ return caps;
+}
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index cb18ee637cb7..7b8a070a782d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -655,7 +655,7 @@ static int __build_xattrs(struct inode *inode)
u32 len;
const char *name, *val;
struct ceph_inode_info *ci = ceph_inode(inode);
- int xattr_version;
+ u64 xattr_version;
struct ceph_inode_xattr **xattrs = NULL;
int err = 0;
int i;
@@ -851,7 +851,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+ dout("getxattr %p name '%s' ver=%lld index_ver=%lld\n", inode, name,
ci->i_xattrs.version, ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
@@ -1078,7 +1078,8 @@ retry:
}
}
- dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
+ dout("setxattr %p name '%s' issued %s\n", inode, name,
+ ceph_cap_string(issued));
__build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, name_len, val_len);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 19f6e592b941..276e4b5ea8e0 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -611,12 +611,12 @@ static int cifs_stats_proc_open(struct inode *inode, struct file *file)
return single_open(file, cifs_stats_proc_show, NULL);
}
-static const struct file_operations cifs_stats_proc_fops = {
- .open = cifs_stats_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifs_stats_proc_write,
+static const struct proc_ops cifs_stats_proc_ops = {
+ .proc_open = cifs_stats_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = cifs_stats_proc_write,
};
#ifdef CONFIG_CIFS_SMB_DIRECT
@@ -640,12 +640,12 @@ static int name##_open(struct inode *inode, struct file *file) \
return single_open(file, name##_proc_show, NULL); \
} \
\
-static const struct file_operations cifs_##name##_proc_fops = { \
- .open = name##_open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
- .write = name##_write, \
+static const struct proc_ops cifs_##name##_proc_fops = { \
+ .proc_open = name##_open, \
+ .proc_read = seq_read, \
+ .proc_lseek = seq_lseek, \
+ .proc_release = single_release, \
+ .proc_write = name##_write, \
}
PROC_FILE_DEFINE(rdma_readwrite_threshold);
@@ -659,11 +659,11 @@ PROC_FILE_DEFINE(smbd_receive_credit_max);
#endif
static struct proc_dir_entry *proc_fs_cifs;
-static const struct file_operations cifsFYI_proc_fops;
-static const struct file_operations cifs_lookup_cache_proc_fops;
-static const struct file_operations traceSMB_proc_fops;
-static const struct file_operations cifs_security_flags_proc_fops;
-static const struct file_operations cifs_linux_ext_proc_fops;
+static const struct proc_ops cifsFYI_proc_ops;
+static const struct proc_ops cifs_lookup_cache_proc_ops;
+static const struct proc_ops traceSMB_proc_ops;
+static const struct proc_ops cifs_security_flags_proc_ops;
+static const struct proc_ops cifs_linux_ext_proc_ops;
void
cifs_proc_init(void)
@@ -678,18 +678,18 @@ cifs_proc_init(void)
proc_create_single("open_files", 0400, proc_fs_cifs,
cifs_debug_files_proc_show);
- proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops);
- proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops);
- proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops);
+ proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops);
+ proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops);
+ proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops);
proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs,
- &cifs_linux_ext_proc_fops);
+ &cifs_linux_ext_proc_ops);
proc_create("SecurityFlags", 0644, proc_fs_cifs,
- &cifs_security_flags_proc_fops);
+ &cifs_security_flags_proc_ops);
proc_create("LookupCacheEnabled", 0644, proc_fs_cifs,
- &cifs_lookup_cache_proc_fops);
+ &cifs_lookup_cache_proc_ops);
#ifdef CONFIG_CIFS_DFS_UPCALL
- proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_fops);
+ proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops);
#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
@@ -774,12 +774,12 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
return count;
}
-static const struct file_operations cifsFYI_proc_fops = {
- .open = cifsFYI_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifsFYI_proc_write,
+static const struct proc_ops cifsFYI_proc_ops = {
+ .proc_open = cifsFYI_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = cifsFYI_proc_write,
};
static int cifs_linux_ext_proc_show(struct seq_file *m, void *v)
@@ -805,12 +805,12 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file,
return count;
}
-static const struct file_operations cifs_linux_ext_proc_fops = {
- .open = cifs_linux_ext_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifs_linux_ext_proc_write,
+static const struct proc_ops cifs_linux_ext_proc_ops = {
+ .proc_open = cifs_linux_ext_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = cifs_linux_ext_proc_write,
};
static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v)
@@ -836,12 +836,12 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file,
return count;
}
-static const struct file_operations cifs_lookup_cache_proc_fops = {
- .open = cifs_lookup_cache_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifs_lookup_cache_proc_write,
+static const struct proc_ops cifs_lookup_cache_proc_ops = {
+ .proc_open = cifs_lookup_cache_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = cifs_lookup_cache_proc_write,
};
static int traceSMB_proc_show(struct seq_file *m, void *v)
@@ -867,12 +867,12 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
return count;
}
-static const struct file_operations traceSMB_proc_fops = {
- .open = traceSMB_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = traceSMB_proc_write,
+static const struct proc_ops traceSMB_proc_ops = {
+ .proc_open = traceSMB_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = traceSMB_proc_write,
};
static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
@@ -978,12 +978,12 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
return count;
}
-static const struct file_operations cifs_security_flags_proc_fops = {
- .open = cifs_security_flags_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifs_security_flags_proc_write,
+static const struct proc_ops cifs_security_flags_proc_ops = {
+ .proc_open = cifs_security_flags_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = cifs_security_flags_proc_write,
};
#else
inline void cifs_proc_init(void)
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 9a384d1e27b4..43c1b43a07ec 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -8,6 +8,7 @@
#include <linux/jhash.h>
#include <linux/ktime.h>
#include <linux/slab.h>
+#include <linux/proc_fs.h>
#include <linux/nls.h>
#include <linux/workqueue.h>
#include "cifsglob.h"
@@ -211,12 +212,12 @@ static int dfscache_proc_open(struct inode *inode, struct file *file)
return single_open(file, dfscache_proc_show, NULL);
}
-const struct file_operations dfscache_proc_fops = {
- .open = dfscache_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = dfscache_proc_write,
+const struct proc_ops dfscache_proc_ops = {
+ .proc_open = dfscache_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = dfscache_proc_write,
};
#ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
index 76c732943f5f..99ee44f8ad07 100644
--- a/fs/cifs/dfs_cache.h
+++ b/fs/cifs/dfs_cache.h
@@ -24,7 +24,7 @@ struct dfs_cache_tgt_iterator {
extern int dfs_cache_init(void);
extern void dfs_cache_destroy(void);
-extern const struct file_operations dfscache_proc_fops;
+extern const struct proc_ops dfscache_proc_ops;
extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_codepage, int remap,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9b547f7f5f5d..676e96a7a9f0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -113,6 +113,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
}
/* revalidate if mtime or size have changed */
+ fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
if (timespec64_equal(&inode->i_mtime, &fattr->cf_mtime) &&
cifs_i->server_eof == fattr->cf_eof) {
cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
@@ -162,6 +163,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
cifs_revalidate_cache(inode, fattr);
spin_lock(&inode->i_lock);
+ fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
+ fattr->cf_atime = timestamp_truncate(fattr->cf_atime, inode);
+ fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode);
/* we do not want atime to be less than mtime, it broke some apps */
if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0)
inode->i_atime = fattr->cf_mtime;
@@ -329,8 +333,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
fattr->cf_uid = cifs_sb->mnt_uid;
fattr->cf_gid = cifs_sb->mnt_gid;
- ktime_get_real_ts64(&fattr->cf_mtime);
- fattr->cf_mtime = timespec64_trunc(fattr->cf_mtime, sb->s_time_gran);
+ ktime_get_coarse_real_ts64(&fattr->cf_mtime);
fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime;
fattr->cf_nlink = 2;
fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL;
@@ -609,10 +612,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
if (info->LastAccessTime)
fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
- else {
- ktime_get_real_ts64(&fattr->cf_atime);
- fattr->cf_atime = timespec64_trunc(fattr->cf_atime, sb->s_time_gran);
- }
+ else
+ ktime_get_coarse_real_ts64(&fattr->cf_atime);
fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 680aba9c00d5..fd0b5dd68f9e 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -76,14 +76,11 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
if (ia_valid & ATTR_GID)
sd_iattr->ia_gid = iattr->ia_gid;
if (ia_valid & ATTR_ATIME)
- sd_iattr->ia_atime = timestamp_truncate(iattr->ia_atime,
- inode);
+ sd_iattr->ia_atime = iattr->ia_atime;
if (ia_valid & ATTR_MTIME)
- sd_iattr->ia_mtime = timestamp_truncate(iattr->ia_mtime,
- inode);
+ sd_iattr->ia_mtime = iattr->ia_mtime;
if (ia_valid & ATTR_CTIME)
- sd_iattr->ia_ctime = timestamp_truncate(iattr->ia_ctime,
- inode);
+ sd_iattr->ia_ctime = iattr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = iattr->ia_mode;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index dc6cffc4feba..e742dfc66933 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -332,7 +332,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
parent = debugfs_mount->mnt_root;
inode_lock(d_inode(parent));
- dentry = lookup_one_len(name, parent, strlen(name));
+ if (unlikely(IS_DEADDIR(d_inode(parent))))
+ dentry = ERR_PTR(-ENOENT);
+ else
+ dentry = lookup_one_len(name, parent, strlen(name));
if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
if (d_is_dir(dentry))
pr_err("Directory '%s' with parent '%s' already present!\n",
@@ -681,62 +684,15 @@ static void __debugfs_file_removed(struct dentry *dentry)
wait_for_completion(&fsd->active_users_drained);
}
-static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
-{
- int ret = 0;
-
- if (simple_positive(dentry)) {
- dget(dentry);
- if (d_is_dir(dentry)) {
- ret = simple_rmdir(d_inode(parent), dentry);
- if (!ret)
- fsnotify_rmdir(d_inode(parent), dentry);
- } else {
- simple_unlink(d_inode(parent), dentry);
- fsnotify_unlink(d_inode(parent), dentry);
- }
- if (!ret)
- d_delete(dentry);
- if (d_is_reg(dentry))
- __debugfs_file_removed(dentry);
- dput(dentry);
- }
- return ret;
-}
-
-/**
- * debugfs_remove - removes a file or directory from the debugfs filesystem
- * @dentry: a pointer to a the dentry of the file or directory to be
- * removed. If this parameter is NULL or an error value, nothing
- * will be done.
- *
- * This function removes a file or directory in debugfs that was previously
- * created with a call to another debugfs function (like
- * debugfs_create_file() or variants thereof.)
- *
- * This function is required to be called in order for the file to be
- * removed, no automatic cleanup of files will happen when a module is
- * removed, you are responsible here.
- */
-void debugfs_remove(struct dentry *dentry)
+static void remove_one(struct dentry *victim)
{
- struct dentry *parent;
- int ret;
-
- if (IS_ERR_OR_NULL(dentry))
- return;
-
- parent = dentry->d_parent;
- inode_lock(d_inode(parent));
- ret = __debugfs_remove(dentry, parent);
- inode_unlock(d_inode(parent));
- if (!ret)
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ if (d_is_reg(victim))
+ __debugfs_file_removed(victim);
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
-EXPORT_SYMBOL_GPL(debugfs_remove);
/**
- * debugfs_remove_recursive - recursively removes a directory
+ * debugfs_remove - recursively removes a directory
* @dentry: a pointer to a the dentry of the directory to be removed. If this
* parameter is NULL or an error value, nothing will be done.
*
@@ -748,65 +704,16 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
* removed, no automatic cleanup of files will happen when a module is
* removed, you are responsible here.
*/
-void debugfs_remove_recursive(struct dentry *dentry)
+void debugfs_remove(struct dentry *dentry)
{
- struct dentry *child, *parent;
-
if (IS_ERR_OR_NULL(dentry))
return;
- parent = dentry;
- down:
- inode_lock(d_inode(parent));
- loop:
- /*
- * The parent->d_subdirs is protected by the d_lock. Outside that
- * lock, the child can be unlinked and set to be freed which can
- * use the d_u.d_child as the rcu head and corrupt this list.
- */
- spin_lock(&parent->d_lock);
- list_for_each_entry(child, &parent->d_subdirs, d_child) {
- if (!simple_positive(child))
- continue;
-
- /* perhaps simple_empty(child) makes more sense */
- if (!list_empty(&child->d_subdirs)) {
- spin_unlock(&parent->d_lock);
- inode_unlock(d_inode(parent));
- parent = child;
- goto down;
- }
-
- spin_unlock(&parent->d_lock);
-
- if (!__debugfs_remove(child, parent))
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
-
- /*
- * The parent->d_lock protects agaist child from unlinking
- * from d_subdirs. When releasing the parent->d_lock we can
- * no longer trust that the next pointer is valid.
- * Restart the loop. We'll skip this one with the
- * simple_positive() check.
- */
- goto loop;
- }
- spin_unlock(&parent->d_lock);
-
- inode_unlock(d_inode(parent));
- child = parent;
- parent = parent->d_parent;
- inode_lock(d_inode(parent));
-
- if (child != dentry)
- /* go up */
- goto loop;
-
- if (!__debugfs_remove(child, parent))
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
- inode_unlock(d_inode(parent));
+ simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
+ simple_recursive_removal(dentry, remove_one);
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
-EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
+EXPORT_SYMBOL_GPL(debugfs_remove);
/**
* debugfs_rename - rename a file/directory in the debugfs filesystem
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8aa0ea8c55e8..78e41c7c3d05 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -24,6 +24,8 @@
#include <linux/seq_file.h>
#include <linux/idr.h>
+DEFINE_PER_CPU(int, eventfd_wake_count);
+
static DEFINE_IDA(eventfd_ida);
struct eventfd_ctx {
@@ -60,12 +62,25 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
unsigned long flags;
+ /*
+ * Deadlock or stack overflow issues can happen if we recurse here
+ * through waitqueue wakeup handlers. If the caller users potentially
+ * nested waitqueues with custom wakeup handlers, then it should
+ * check eventfd_signal_count() before calling this function. If
+ * it returns true, the eventfd_signal() call should be deferred to a
+ * safe context.
+ */
+ if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+ return 0;
+
spin_lock_irqsave(&ctx->wqh.lock, flags);
+ this_cpu_inc(eventfd_wake_count);
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ this_cpu_dec(eventfd_wake_count);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88b213bd32bc..8434217549b3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6043,7 +6043,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
bh = ext4_bread(handle, inode, blk,
EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
- } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
+ } while (PTR_ERR(bh) == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
if (IS_ERR(bh))
return PTR_ERR(bh);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 86ddbb55d2b1..0d4da644df3b 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -829,18 +829,12 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
inode->i_uid = attr->ia_uid;
if (ia_valid & ATTR_GID)
inode->i_gid = attr->ia_gid;
- if (ia_valid & ATTR_ATIME) {
- inode->i_atime = timestamp_truncate(attr->ia_atime,
- inode);
- }
- if (ia_valid & ATTR_MTIME) {
- inode->i_mtime = timestamp_truncate(attr->ia_mtime,
- inode);
- }
- if (ia_valid & ATTR_CTIME) {
- inode->i_ctime = timestamp_truncate(attr->ia_ctime,
- inode);
- }
+ if (ia_valid & ATTR_ATIME)
+ inode->i_atime = attr->ia_atime;
+ if (ia_valid & ATTR_MTIME)
+ inode->i_mtime = attr->ia_mtime;
+ if (ia_valid & ATTR_CTIME)
+ inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3314a0f3405e..9d02cdcdbb07 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -875,7 +875,7 @@ static int truncate_dnode(struct dnode_of_data *dn)
/* get direct node */
page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid);
- if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
+ if (PTR_ERR(page) == -ENOENT)
return 1;
else if (IS_ERR(page))
return PTR_ERR(page);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1e08bd54c5fb..f1b2a1fc2a6a 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -271,6 +271,14 @@ static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts)
{
return (struct timespec64){ ts.tv_sec & ~1ULL, 0 };
}
+
+static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts)
+{
+ if (ts.tv_nsec)
+ ts.tv_nsec -= ts.tv_nsec % 10000000UL;
+ return ts;
+}
+
/*
* truncate the various times with appropriate granularity:
* root inode:
@@ -308,7 +316,7 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
}
if (flags & S_CTIME) {
if (sbi->options.isvfat)
- inode->i_ctime = timespec64_trunc(*now, 10000000);
+ inode->i_ctime = fat_timespec64_trunc_10ms(*now);
else
inode->i_ctime = fat_timespec64_trunc_2secs(*now);
}
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 9616af3768e1..08e91efbce53 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -111,7 +111,7 @@ extern void fscache_enqueue_object(struct fscache_object *);
* object-list.c
*/
#ifdef CONFIG_FSCACHE_OBJECT_LIST
-extern const struct file_operations fscache_objlist_fops;
+extern const struct proc_ops fscache_objlist_proc_ops;
extern void fscache_objlist_add(struct fscache_object *);
extern void fscache_objlist_remove(struct fscache_object *);
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 72ebfe578f40..e106a1a1600d 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -7,6 +7,7 @@
#define FSCACHE_DEBUG_LEVEL COOKIE
#include <linux/module.h>
+#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/key.h>
@@ -405,9 +406,9 @@ static int fscache_objlist_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-const struct file_operations fscache_objlist_fops = {
- .open = fscache_objlist_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = fscache_objlist_release,
+const struct proc_ops fscache_objlist_proc_ops = {
+ .proc_open = fscache_objlist_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = fscache_objlist_release,
};
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index 5523446e2952..90a7bc22f7e1 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -35,7 +35,7 @@ int __init fscache_proc_init(void)
#ifdef CONFIG_FSCACHE_OBJECT_LIST
if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL,
- &fscache_objlist_fops))
+ &fscache_objlist_proc_ops))
goto error_objects;
#endif
diff --git a/fs/inode.c b/fs/inode.c
index ea15c6d9f274..c7418b0b4168 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1683,12 +1683,9 @@ EXPORT_SYMBOL(generic_update_time);
*/
static int update_time(struct inode *inode, struct timespec64 *time, int flags)
{
- int (*update_time)(struct inode *, struct timespec64 *, int);
-
- update_time = inode->i_op->update_time ? inode->i_op->update_time :
- generic_update_time;
-
- return update_time(inode, time, flags);
+ if (inode->i_op->update_time)
+ return inode->i_op->update_time(inode, time, flags);
+ return generic_update_time(inode, time, flags);
}
/**
@@ -2154,30 +2151,6 @@ void inode_nohighmem(struct inode *inode)
EXPORT_SYMBOL(inode_nohighmem);
/**
- * timespec64_trunc - Truncate timespec64 to a granularity
- * @t: Timespec64
- * @gran: Granularity in ns.
- *
- * Truncate a timespec64 to a granularity. Always rounds down. gran must
- * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
- */
-struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran)
-{
- /* Avoid division in the common cases 1 ns and 1 s. */
- if (gran == 1) {
- /* nothing */
- } else if (gran == NSEC_PER_SEC) {
- t.tv_nsec = 0;
- } else if (gran > 1 && gran < NSEC_PER_SEC) {
- t.tv_nsec -= t.tv_nsec % gran;
- } else {
- WARN(1, "illegal file time granularity: %u", gran);
- }
- return t;
-}
-EXPORT_SYMBOL(timespec64_trunc);
-
-/**
* timestamp_truncate - Truncate timespec to a granularity
* @t: Timespec
* @inode: inode being updated
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1806afddfea5..77f22c3da30f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -585,8 +585,7 @@ struct io_submit_state {
* io_kiocb alloc cache
*/
void *reqs[IO_IOPOLL_BATCH];
- unsigned int free_reqs;
- unsigned int cur_req;
+ unsigned int free_reqs;
/*
* File reference cache
@@ -754,6 +753,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
+static void io_ring_file_ref_flush(struct fixed_file_data *data);
static struct kmem_cache *req_cachep;
@@ -1020,21 +1020,28 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
+ if (!ctx->cq_ev_fd)
+ return false;
if (!ctx->eventfd_async)
return true;
return io_wq_current_is_worker() || in_interrupt();
}
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
if (waitqueue_active(&ctx->sqo_wait))
wake_up(&ctx->sqo_wait);
- if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx))
+ if (trigger_ev)
eventfd_signal(ctx->cq_ev_fd, 1);
}
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx));
+}
+
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
@@ -1183,12 +1190,10 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
ret = 1;
}
state->free_reqs = ret - 1;
- state->cur_req = 1;
- req = state->reqs[0];
+ req = state->reqs[ret - 1];
} else {
- req = state->reqs[state->cur_req];
state->free_reqs--;
- state->cur_req++;
+ req = state->reqs[state->free_reqs];
}
got_it:
@@ -1855,9 +1860,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
unsigned ioprio;
int ret;
- if (!req->file)
- return -EBADF;
-
if (S_ISREG(file_inode(req->file)->i_mode))
req->flags |= REQ_F_ISREG;
@@ -1866,8 +1868,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->flags |= REQ_F_CUR_POS;
kiocb->ki_pos = req->file->f_pos;
}
- kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
+ kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+ ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+ if (unlikely(ret))
+ return ret;
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -1879,10 +1884,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
} else
kiocb->ki_ioprio = get_current_ioprio();
- ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
- if (unlikely(ret))
- return ret;
-
/* don't allow async punt if RWF_NOWAIT was requested */
if ((kiocb->ki_flags & IOCB_NOWAIT) ||
(req->file->f_flags & O_NONBLOCK))
@@ -2164,10 +2165,12 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
{
if (!io_op_defs[req->opcode].async_ctx)
return 0;
- if (!req->io && io_alloc_async_ctx(req))
- return -ENOMEM;
+ if (!req->io) {
+ if (io_alloc_async_ctx(req))
+ return -ENOMEM;
- io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+ io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+ }
req->work.func = io_rw_async;
return 0;
}
@@ -2724,9 +2727,16 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
struct io_fadvise *fa = &req->fadvise;
int ret;
- /* DONTNEED may block, others _should_ not */
- if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
- return -EAGAIN;
+ if (force_nonblock) {
+ switch (fa->advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_SEQUENTIAL:
+ break;
+ default:
+ return -EAGAIN;
+ }
+ }
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
@@ -2837,16 +2847,13 @@ static void io_close_finish(struct io_wq_work **workptr)
int ret;
ret = filp_close(req->close.put_file, req->work.files);
- if (ret < 0) {
+ if (ret < 0)
req_set_fail_links(req);
- }
io_cqring_add_event(req, ret);
}
fput(req->close.put_file);
- /* we bypassed the re-issue, drop the submission reference */
- io_put_req(req);
io_put_req_find_next(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
@@ -2888,7 +2895,13 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
eagain:
req->work.func = io_close_finish;
- return -EAGAIN;
+ /*
+ * Do manual async queue here to avoid grabbing files - we don't
+ * need the files, and it'll cause io_close_finish() to close
+ * the file again and cause a double CQE entry for this request
+ */
+ io_queue_async_work(req);
+ return 0;
}
static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -3083,7 +3096,8 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
else if (force_nonblock)
flags |= MSG_DONTWAIT;
- ret = __sys_sendmsg_sock(sock, &msg, flags);
+ msg.msg_flags = flags;
+ ret = sock_sendmsg(sock, &msg);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret == -ERESTARTSYS)
@@ -3109,6 +3123,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ sr->len = READ_ONCE(sqe->len);
if (!io || req->opcode == IORING_OP_RECV)
return 0;
@@ -3227,7 +3242,7 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
else if (force_nonblock)
flags |= MSG_DONTWAIT;
- ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags);
+ ret = sock_recvmsg(sock, &msg, flags);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret == -ERESTARTSYS)
@@ -3561,6 +3576,14 @@ static void io_poll_flush(struct io_wq_work **workptr)
__io_poll_flush(req->ctx, nodes);
}
+static void io_poll_trigger_evfd(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+
+ eventfd_signal(req->ctx->cq_ev_fd, 1);
+ io_put_req(req);
+}
+
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
@@ -3586,14 +3609,22 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (llist_empty(&ctx->poll_llist) &&
spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+ bool trigger_ev;
+
hash_del(&req->hash_node);
io_poll_complete(req, mask, 0);
- req->flags |= REQ_F_COMP_LOCKED;
- io_put_req(req);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- io_cqring_ev_posted(ctx);
- req = NULL;
+ trigger_ev = io_should_trigger_evfd(ctx);
+ if (trigger_ev && eventfd_signal_count()) {
+ trigger_ev = false;
+ req->work.func = io_poll_trigger_evfd;
+ } else {
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
+ req = NULL;
+ }
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ __io_cqring_ev_posted(ctx, trigger_ev);
} else {
req->result = mask;
req->llist_node.next = NULL;
@@ -4815,8 +4846,7 @@ static void io_submit_state_end(struct io_submit_state *state)
blk_finish_plug(&state->plug);
io_file_put(state);
if (state->free_reqs)
- kmem_cache_free_bulk(req_cachep, state->free_reqs,
- &state->reqs[state->cur_req]);
+ kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
}
/*
@@ -5041,7 +5071,8 @@ static int io_sq_thread(void *data)
* reap events and wake us up.
*/
if (inflight ||
- (!time_after(jiffies, timeout) && ret != -EBUSY)) {
+ (!time_after(jiffies, timeout) && ret != -EBUSY &&
+ !percpu_ref_is_dying(&ctx->refs))) {
cond_resched();
continue;
}
@@ -5231,15 +5262,10 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
if (!data)
return -ENXIO;
- /* protect against inflight atomic switch, which drops the ref */
- percpu_ref_get(&data->refs);
- /* wait for existing switches */
- flush_work(&data->ref_work);
percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
- wait_for_completion(&data->done);
- percpu_ref_put(&data->refs);
- /* flush potential new switch */
flush_work(&data->ref_work);
+ wait_for_completion(&data->done);
+ io_ring_file_ref_flush(data);
percpu_ref_exit(&data->refs);
__io_sqe_files_unregister(ctx);
@@ -5477,14 +5503,11 @@ struct io_file_put {
struct completion *done;
};
-static void io_ring_file_ref_switch(struct work_struct *work)
+static void io_ring_file_ref_flush(struct fixed_file_data *data)
{
struct io_file_put *pfile, *tmp;
- struct fixed_file_data *data;
struct llist_node *node;
- data = container_of(work, struct fixed_file_data, ref_work);
-
while ((node = llist_del_all(&data->put_llist)) != NULL) {
llist_for_each_entry_safe(pfile, tmp, node, llist) {
io_ring_file_put(data->ctx, pfile->file);
@@ -5494,7 +5517,14 @@ static void io_ring_file_ref_switch(struct work_struct *work)
kfree(pfile);
}
}
+}
+static void io_ring_file_ref_switch(struct work_struct *work)
+{
+ struct fixed_file_data *data;
+
+ data = container_of(work, struct fixed_file_data, ref_work);
+ io_ring_file_ref_flush(data);
percpu_ref_get(&data->refs);
percpu_ref_switch_to_percpu(&data->refs);
}
@@ -5505,8 +5535,14 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
data = container_of(ref, struct fixed_file_data, refs);
- /* we can't safely switch from inside this context, punt to wq */
- queue_work(system_wq, &data->ref_work);
+ /*
+ * We can't safely switch from inside this context, punt to wq. If
+ * the table ref is going away, the table is being unregistered.
+ * Don't queue up the async work for that case, the caller will
+ * handle it.
+ */
+ if (!percpu_ref_is_dying(&data->refs))
+ queue_work(system_wq, &data->ref_work);
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -6295,6 +6331,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
percpu_ref_kill(&ctx->refs);
mutex_unlock(&ctx->uring_lock);
+ /*
+ * Wait for sq thread to idle, if we have one. It won't spin on new
+ * work after we've killed the ctx ref above. This is important to do
+ * before we cancel existing commands, as the thread could otherwise
+ * be queueing new work post that. If that's work we need to cancel,
+ * it could cause shutdown to hang.
+ */
+ while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
+ cpu_relax();
+
io_kill_timeouts(ctx);
io_poll_remove_all(ctx);
@@ -6501,6 +6547,80 @@ out_fput:
return submitted ? submitted : ret;
}
+static int io_uring_show_cred(int id, void *p, void *data)
+{
+ const struct cred *cred = p;
+ struct seq_file *m = data;
+ struct user_namespace *uns = seq_user_ns(m);
+ struct group_info *gi;
+ kernel_cap_t cap;
+ unsigned __capi;
+ int g;
+
+ seq_printf(m, "%5d\n", id);
+ seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
+ seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
+ seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
+ seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
+ seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
+ seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
+ seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
+ seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
+ seq_puts(m, "\n\tGroups:\t");
+ gi = cred->group_info;
+ for (g = 0; g < gi->ngroups; g++) {
+ seq_put_decimal_ull(m, g ? " " : "",
+ from_kgid_munged(uns, gi->gid[g]));
+ }
+ seq_puts(m, "\n\tCapEff:\t");
+ cap = cred->cap_effective;
+ CAP_FOR_EACH_U32(__capi)
+ seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+{
+ int i;
+
+ mutex_lock(&ctx->uring_lock);
+ seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ struct fixed_file_table *table;
+ struct file *f;
+
+ table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
+ f = table->files[i & IORING_FILE_TABLE_MASK];
+ if (f)
+ seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
+ else
+ seq_printf(m, "%5u: <none>\n", i);
+ }
+ seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
+
+ seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
+ (unsigned int) buf->len);
+ }
+ if (!idr_is_empty(&ctx->personality_idr)) {
+ seq_printf(m, "Personalities:\n");
+ idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
+ }
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct io_ring_ctx *ctx = f->private_data;
+
+ if (percpu_ref_tryget(&ctx->refs)) {
+ __io_uring_show_fdinfo(ctx, m);
+ percpu_ref_put(&ctx->refs);
+ }
+}
+
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
.flush = io_uring_flush,
@@ -6511,6 +6631,7 @@ static const struct file_operations io_uring_fops = {
#endif
.poll = io_uring_poll,
.fasync = io_uring_fasync,
+ .show_fdinfo = io_uring_show_fdinfo,
};
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
@@ -6963,6 +7084,39 @@ out_fput:
static int __init io_uring_init(void)
{
+#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
+ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
+ BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
+} while (0)
+
+#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
+ __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
+ BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
+ BUILD_BUG_SQE_ELEM(0, __u8, opcode);
+ BUILD_BUG_SQE_ELEM(1, __u8, flags);
+ BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
+ BUILD_BUG_SQE_ELEM(4, __s32, fd);
+ BUILD_BUG_SQE_ELEM(8, __u64, off);
+ BUILD_BUG_SQE_ELEM(8, __u64, addr2);
+ BUILD_BUG_SQE_ELEM(16, __u64, addr);
+ BUILD_BUG_SQE_ELEM(24, __u32, len);
+ BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
+ BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
+ BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
+ BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
+ BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
+ BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
+ BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
+ BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
+ BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
+ BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
+ BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
+ BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
+ BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
+ BUILD_BUG_SQE_ELEM(32, __u64, user_data);
+ BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
+ BUILD_BUG_SQE_ELEM(42, __u16, personality);
+
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
return 0;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 60bf8ff78913..eb8ca446d1ab 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1074,12 +1074,11 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static const struct file_operations jbd2_seq_info_fops = {
- .owner = THIS_MODULE,
- .open = jbd2_seq_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = jbd2_seq_info_release,
+static const struct proc_ops jbd2_info_proc_ops = {
+ .proc_open = jbd2_seq_info_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = jbd2_seq_info_release,
};
static struct proc_dir_entry *proc_jbd2_stats;
@@ -1089,7 +1088,7 @@ static void jbd2_stats_proc_init(journal_t *journal)
journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
if (journal->j_proc_entry) {
proc_create_data("info", S_IRUGO, journal->j_proc_entry,
- &jbd2_seq_info_fops, journal);
+ &jbd2_info_proc_ops, journal);
}
}
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index 888cdd685a1e..44b62b3c322e 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -43,12 +43,12 @@ static ssize_t jfs_loglevel_proc_write(struct file *file,
return count;
}
-static const struct file_operations jfs_loglevel_proc_fops = {
- .open = jfs_loglevel_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = jfs_loglevel_proc_write,
+static const struct proc_ops jfs_loglevel_proc_ops = {
+ .proc_open = jfs_loglevel_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = jfs_loglevel_proc_write,
};
#endif
@@ -68,7 +68,7 @@ void jfs_proc_init(void)
#endif
#ifdef CONFIG_JFS_DEBUG
proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show);
- proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops);
+ proc_create("loglevel", 0, base, &jfs_loglevel_proc_ops);
#endif
}
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index caade185e568..7dfcab2a2da6 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -4027,7 +4027,6 @@ static int dbGetL2AGSize(s64 nblocks)
*/
#define MAXL0PAGES (1 + LPERCTL)
#define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES)
-#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES)
/*
* convert number of map pages to the zero origin top dmapctl level
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index eac277c63d42..d0f7a5abd9a9 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -160,9 +160,9 @@ static inline void set_inode_attr(struct inode *inode,
{
inode->i_uid = attrs->ia_uid;
inode->i_gid = attrs->ia_gid;
- inode->i_atime = timestamp_truncate(attrs->ia_atime, inode);
- inode->i_mtime = timestamp_truncate(attrs->ia_mtime, inode);
- inode->i_ctime = timestamp_truncate(attrs->ia_ctime, inode);
+ inode->i_atime = attrs->ia_atime;
+ inode->i_mtime = attrs->ia_mtime;
+ inode->i_ctime = attrs->ia_ctime;
}
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
diff --git a/fs/libfs.c b/fs/libfs.c
index 1463b038ffc4..c686bd9caac6 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -19,6 +19,7 @@
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
+#include <linux/fsnotify.h>
#include <linux/uaccess.h>
@@ -239,6 +240,75 @@ const struct inode_operations simple_dir_inode_operations = {
};
EXPORT_SYMBOL(simple_dir_inode_operations);
+static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
+{
+ struct dentry *child = NULL;
+ struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;
+
+ spin_lock(&parent->d_lock);
+ while ((p = p->next) != &parent->d_subdirs) {
+ struct dentry *d = container_of(p, struct dentry, d_child);
+ if (simple_positive(d)) {
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+ if (simple_positive(d))
+ child = dget_dlock(d);
+ spin_unlock(&d->d_lock);
+ if (likely(child))
+ break;
+ }
+ }
+ spin_unlock(&parent->d_lock);
+ dput(prev);
+ return child;
+}
+
+void simple_recursive_removal(struct dentry *dentry,
+ void (*callback)(struct dentry *))
+{
+ struct dentry *this = dget(dentry);
+ while (true) {
+ struct dentry *victim = NULL, *child;
+ struct inode *inode = this->d_inode;
+
+ inode_lock(inode);
+ if (d_is_dir(this))
+ inode->i_flags |= S_DEAD;
+ while ((child = find_next_child(this, victim)) == NULL) {
+ // kill and ascend
+ // update metadata while it's still locked
+ inode->i_ctime = current_time(inode);
+ clear_nlink(inode);
+ inode_unlock(inode);
+ victim = this;
+ this = this->d_parent;
+ inode = this->d_inode;
+ inode_lock(inode);
+ if (simple_positive(victim)) {
+ d_invalidate(victim); // avoid lost mounts
+ if (d_is_dir(victim))
+ fsnotify_rmdir(inode, victim);
+ else
+ fsnotify_unlink(inode, victim);
+ if (callback)
+ callback(victim);
+ dput(victim); // unpin it
+ }
+ if (victim == dentry) {
+ inode->i_ctime = inode->i_mtime =
+ current_time(inode);
+ if (d_is_dir(dentry))
+ drop_nlink(inode);
+ inode_unlock(inode);
+ dput(dentry);
+ return;
+ }
+ }
+ inode_unlock(inode);
+ this = child;
+ }
+}
+EXPORT_SYMBOL(simple_recursive_removal);
+
static const struct super_operations simple_super_operations = {
.statfs = simple_statfs,
};
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
index ca9228a56d65..a01f08c8c2f3 100644
--- a/fs/lockd/procfs.c
+++ b/fs/lockd/procfs.c
@@ -60,11 +60,11 @@ nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
}
-static const struct file_operations lockd_end_grace_operations = {
- .write = nlm_end_grace_write,
- .read = nlm_end_grace_read,
- .llseek = default_llseek,
- .release = simple_transaction_release,
+static const struct proc_ops lockd_end_grace_proc_ops = {
+ .proc_write = nlm_end_grace_write,
+ .proc_read = nlm_end_grace_read,
+ .proc_lseek = default_llseek,
+ .proc_release = simple_transaction_release,
};
int __init
@@ -76,7 +76,7 @@ lockd_create_procfs(void)
if (!entry)
return -ENOMEM;
entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry,
- &lockd_end_grace_operations);
+ &lockd_end_grace_proc_ops);
if (!entry) {
remove_proc_entry("fs/lockd", NULL);
return -ENOMEM;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 11b42c523f04..7eb919f1b13f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -157,11 +157,11 @@ static int exports_proc_open(struct inode *inode, struct file *file)
return exports_net_open(current->nsproxy->net_ns, file);
}
-static const struct file_operations exports_proc_operations = {
- .open = exports_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops exports_proc_ops = {
+ .proc_open = exports_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
static int exports_nfsd_open(struct inode *inode, struct file *file)
@@ -1431,8 +1431,7 @@ static int create_proc_exports_entry(void)
entry = proc_mkdir("fs/nfs", NULL);
if (!entry)
return -ENOMEM;
- entry = proc_create("exports", 0, entry,
- &exports_proc_operations);
+ entry = proc_create("exports", 0, entry, &exports_proc_ops);
if (!entry) {
remove_proc_entry("fs/nfs", NULL);
return -ENOMEM;
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 9bce3b913189..b1bc582b0493 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -84,17 +84,17 @@ static int nfsd_proc_open(struct inode *inode, struct file *file)
return single_open(file, nfsd_proc_show, NULL);
}
-static const struct file_operations nfsd_proc_fops = {
- .open = nfsd_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops nfsd_proc_ops = {
+ .proc_open = nfsd_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
void
nfsd_stat_init(void)
{
- svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops);
+ svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops);
}
void
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 6c7388430ad3..d4359a1df3d5 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2899,18 +2899,12 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
ia_valid |= ATTR_MTIME | ATTR_CTIME;
}
}
- if (ia_valid & ATTR_ATIME) {
- vi->i_atime = timestamp_truncate(attr->ia_atime,
- vi);
- }
- if (ia_valid & ATTR_MTIME) {
- vi->i_mtime = timestamp_truncate(attr->ia_mtime,
- vi);
- }
- if (ia_valid & ATTR_CTIME) {
- vi->i_ctime = timestamp_truncate(attr->ia_ctime,
- vi);
- }
+ if (ia_valid & ATTR_ATIME)
+ vi->i_atime = attr->ia_atime;
+ if (ia_valid & ATTR_MTIME)
+ vi->i_mtime = attr->ia_mtime;
+ if (ia_valid & ATTR_CTIME)
+ vi->i_ctime = attr->ia_ctime;
mark_inode_dirty(vi);
out:
return err;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9876db52913a..6cd5e4924e4d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2101,17 +2101,15 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
struct buffer_head **di_bh,
int meta_level,
- int overwrite_io,
int write_sem,
int wait)
{
int ret = 0;
if (wait)
- ret = ocfs2_inode_lock(inode, NULL, meta_level);
+ ret = ocfs2_inode_lock(inode, di_bh, meta_level);
else
- ret = ocfs2_try_inode_lock(inode,
- overwrite_io ? NULL : di_bh, meta_level);
+ ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
if (ret < 0)
goto out;
@@ -2136,6 +2134,7 @@ static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
out_unlock:
brelse(*di_bh);
+ *di_bh = NULL;
ocfs2_inode_unlock(inode, meta_level);
out:
return ret;
@@ -2177,7 +2176,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
ret = ocfs2_inode_lock_for_extent_tree(inode,
&di_bh,
meta_level,
- overwrite_io,
write_sem,
wait);
if (ret < 0) {
@@ -2233,13 +2231,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
&di_bh,
meta_level,
write_sem);
+ meta_level = 1;
+ write_sem = 1;
ret = ocfs2_inode_lock_for_extent_tree(inode,
&di_bh,
meta_level,
- overwrite_io,
- 1,
+ write_sem,
wait);
- write_sem = 1;
if (ret < 0) {
if (ret != -EAGAIN)
mlog_errno(ret);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4180c3ef0a68..939df99d2dec 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -696,7 +696,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
ac, cl);
- if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
+ if (PTR_ERR(bg_bh) == -ENOSPC)
bg_bh = ocfs2_block_group_alloc_discontig(handle,
alloc_inode,
ac, cl);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 6220642fe113..9fc47c2e078d 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -24,7 +24,7 @@
static int ovl_ccup_set(const char *buf, const struct kernel_param *param)
{
- pr_warn("overlayfs: \"check_copy_up\" module option is obsolete\n");
+ pr_warn("\"check_copy_up\" module option is obsolete\n");
return 0;
}
@@ -123,6 +123,9 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
loff_t old_pos = 0;
loff_t new_pos = 0;
loff_t cloned;
+ loff_t data_pos = -1;
+ loff_t hole_len;
+ bool skip_hole = false;
int error = 0;
if (len == 0)
@@ -144,7 +147,11 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
goto out;
/* Couldn't clone, so now we try to copy the data */
- /* FIXME: copy up sparse files efficiently */
+ /* Check if lower fs supports seek operation */
+ if (old_file->f_mode & FMODE_LSEEK &&
+ old_file->f_op->llseek)
+ skip_hole = true;
+
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
@@ -157,6 +164,36 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
break;
}
+ /*
+ * Fill zero for hole will cost unnecessary disk space
+ * and meanwhile slow down the copy-up speed, so we do
+ * an optimization for hole during copy-up, it relies
+ * on SEEK_DATA implementation in lower fs so if lower
+ * fs does not support it, copy-up will behave as before.
+ *
+ * Detail logic of hole detection as below:
+ * When we detect next data position is larger than current
+ * position we will skip that hole, otherwise we copy
+ * data in the size of OVL_COPY_UP_CHUNK_SIZE. Actually,
+ * it may not recognize all kind of holes and sometimes
+ * only skips partial of hole area. However, it will be
+ * enough for most of the use cases.
+ */
+
+ if (skip_hole && data_pos < old_pos) {
+ data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA);
+ if (data_pos > old_pos) {
+ hole_len = data_pos - old_pos;
+ len -= hole_len;
+ old_pos = new_pos = data_pos;
+ continue;
+ } else if (data_pos == -ENXIO) {
+ break;
+ } else if (data_pos < 0) {
+ skip_hole = false;
+ }
+ }
+
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
@@ -480,7 +517,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
}
inode_lock(temp->d_inode);
- if (c->metacopy)
+ if (S_ISREG(c->stat.mode))
err = ovl_set_size(temp, &c->stat);
if (!err)
err = ovl_set_attr(temp, &c->stat);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 29abdb1d3b5c..8e57d5372b8f 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -35,7 +35,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
dput(wdentry);
if (err) {
- pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
+ pr_err("cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
@@ -53,7 +53,7 @@ static struct dentry *ovl_lookup_temp(struct dentry *workdir)
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
- pr_err("overlayfs: workdir/%s already exists\n", name);
+ pr_err("workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
@@ -134,7 +134,7 @@ static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry,
d = lookup_one_len(dentry->d_name.name, dentry->d_parent,
dentry->d_name.len);
if (IS_ERR(d)) {
- pr_warn("overlayfs: failed lookup after mkdir (%pd2, err=%i).\n",
+ pr_warn("failed lookup after mkdir (%pd2, err=%i).\n",
dentry, err);
return PTR_ERR(d);
}
@@ -267,7 +267,7 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
d_instantiate(dentry, inode);
if (inode != oip.newinode) {
- pr_warn_ratelimited("overlayfs: newly created inode found in cache (%pd2)\n",
+ pr_warn_ratelimited("newly created inode found in cache (%pd2)\n",
dentry);
}
@@ -1009,7 +1009,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir)
spin_unlock(&dentry->d_lock);
} else {
kfree(redirect);
- pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n",
+ pr_warn_ratelimited("failed to set redirect (%i)\n",
err);
/* Fall back to userspace copy-up */
err = -EXDEV;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 70e55588aedc..6f54d70cef27 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -30,7 +30,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry)
}
if (err) {
- pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n",
+ pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n",
dentry, err);
}
@@ -244,7 +244,7 @@ out:
return err;
fail:
- pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n",
+ pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n",
dentry, err, buflen, fh ? (int)fh->fb.len : 0,
fh ? fh->fb.type : 0);
goto out;
@@ -358,7 +358,7 @@ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
*/
static struct dentry *ovl_lookup_real_one(struct dentry *connected,
struct dentry *real,
- struct ovl_layer *layer)
+ const struct ovl_layer *layer)
{
struct inode *dir = d_inode(connected);
struct dentry *this, *parent = NULL;
@@ -406,7 +406,7 @@ out:
return this;
fail:
- pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
+ pr_warn_ratelimited("failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
real, layer->idx, connected, err);
this = ERR_PTR(err);
goto out;
@@ -414,17 +414,16 @@ fail:
static struct dentry *ovl_lookup_real(struct super_block *sb,
struct dentry *real,
- struct ovl_layer *layer);
+ const struct ovl_layer *layer);
/*
* Lookup an indexed or hashed overlay dentry by real inode.
*/
static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
struct dentry *real,
- struct ovl_layer *layer)
+ const struct ovl_layer *layer)
{
struct ovl_fs *ofs = sb->s_fs_info;
- struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
struct dentry *index = NULL;
struct dentry *this = NULL;
struct inode *inode;
@@ -466,7 +465,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
* recursive call walks back from indexed upper to the topmost
* connected/hashed upper parent (or up to root).
*/
- this = ovl_lookup_real(sb, upper, &upper_layer);
+ this = ovl_lookup_real(sb, upper, &ofs->layers[0]);
dput(upper);
}
@@ -487,7 +486,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
*/
static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb,
struct dentry *real,
- struct ovl_layer *layer)
+ const struct ovl_layer *layer)
{
struct dentry *next, *parent = NULL;
struct dentry *ancestor = ERR_PTR(-EIO);
@@ -540,7 +539,7 @@ static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb,
*/
static struct dentry *ovl_lookup_real(struct super_block *sb,
struct dentry *real,
- struct ovl_layer *layer)
+ const struct ovl_layer *layer)
{
struct dentry *connected;
int err = 0;
@@ -631,7 +630,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb,
return connected;
fail:
- pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
+ pr_warn_ratelimited("failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
real, layer->idx, connected, err);
dput(connected);
return ERR_PTR(err);
@@ -646,8 +645,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb,
struct dentry *index)
{
struct ovl_fs *ofs = sb->s_fs_info;
- struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
- struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer;
+ const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer;
struct dentry *real = upper ?: (index ?: lowerpath->dentry);
/*
@@ -822,7 +820,7 @@ out:
return dentry;
out_err:
- pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n",
+ pr_warn_ratelimited("failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n",
fh_len, fh_type, flags, err);
dentry = ERR_PTR(err);
goto out;
@@ -831,7 +829,7 @@ out_err:
static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
- pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n");
+ pr_warn_ratelimited("connectable file handles not supported; use 'no_subtree_check' exportfs option.\n");
return ERR_PTR(-EACCES);
}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index e235a635d9ec..a5317216de73 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -9,8 +9,19 @@
#include <linux/xattr.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
+#include <linux/splice.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
#include "overlayfs.h"
+struct ovl_aio_req {
+ struct kiocb iocb;
+ struct kiocb *orig_iocb;
+ struct fd fd;
+};
+
+static struct kmem_cache *ovl_aio_request_cachep;
+
static char ovl_whatisit(struct inode *inode, struct inode *realinode)
{
if (realinode != ovl_inode_upper(inode))
@@ -146,7 +157,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file_inode(file);
struct fd real;
const struct cred *old_cred;
- ssize_t ret;
+ loff_t ret;
/*
* The two special cases below do not need to involve real fs,
@@ -171,7 +182,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
* limitations that are more strict than ->s_maxbytes for specific
* files, so we use the real file to perform seeks.
*/
- inode_lock(inode);
+ ovl_inode_lock(inode);
real.file->f_pos = file->f_pos;
old_cred = ovl_override_creds(inode->i_sb);
@@ -179,7 +190,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
revert_creds(old_cred);
file->f_pos = real.file->f_pos;
- inode_unlock(inode);
+ ovl_inode_unlock(inode);
fdput(real);
@@ -225,6 +236,33 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb)
return flags;
}
+static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
+{
+ struct kiocb *iocb = &aio_req->iocb;
+ struct kiocb *orig_iocb = aio_req->orig_iocb;
+
+ if (iocb->ki_flags & IOCB_WRITE) {
+ struct inode *inode = file_inode(orig_iocb->ki_filp);
+
+ file_end_write(iocb->ki_filp);
+ ovl_copyattr(ovl_inode_real(inode), inode);
+ }
+
+ orig_iocb->ki_pos = iocb->ki_pos;
+ fdput(aio_req->fd);
+ kmem_cache_free(ovl_aio_request_cachep, aio_req);
+}
+
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
+{
+ struct ovl_aio_req *aio_req = container_of(iocb,
+ struct ovl_aio_req, iocb);
+ struct kiocb *orig_iocb = aio_req->orig_iocb;
+
+ ovl_aio_cleanup_handler(aio_req);
+ orig_iocb->ki_complete(orig_iocb, res, res2);
+}
+
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
@@ -240,10 +278,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
return ret;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb));
+ if (is_sync_kiocb(iocb)) {
+ ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
+ ovl_iocb_to_rwf(iocb));
+ } else {
+ struct ovl_aio_req *aio_req;
+
+ ret = -ENOMEM;
+ aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
+ if (!aio_req)
+ goto out;
+
+ aio_req->fd = real;
+ real.flags = 0;
+ aio_req->orig_iocb = iocb;
+ kiocb_clone(&aio_req->iocb, iocb, real.file);
+ aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
+ if (ret != -EIOCBQUEUED)
+ ovl_aio_cleanup_handler(aio_req);
+ }
+out:
revert_creds(old_cred);
-
ovl_file_accessed(file);
fdput(real);
@@ -274,15 +330,33 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out_unlock;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- file_start_write(real.file);
- ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb));
- file_end_write(real.file);
+ if (is_sync_kiocb(iocb)) {
+ file_start_write(real.file);
+ ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
+ ovl_iocb_to_rwf(iocb));
+ file_end_write(real.file);
+ /* Update size */
+ ovl_copyattr(ovl_inode_real(inode), inode);
+ } else {
+ struct ovl_aio_req *aio_req;
+
+ ret = -ENOMEM;
+ aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
+ if (!aio_req)
+ goto out;
+
+ file_start_write(real.file);
+ aio_req->fd = real;
+ real.flags = 0;
+ aio_req->orig_iocb = iocb;
+ kiocb_clone(&aio_req->iocb, iocb, real.file);
+ aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
+ if (ret != -EIOCBQUEUED)
+ ovl_aio_cleanup_handler(aio_req);
+ }
+out:
revert_creds(old_cred);
-
- /* Update size */
- ovl_copyattr(ovl_inode_real(inode), inode);
-
fdput(real);
out_unlock:
@@ -291,6 +365,48 @@ out_unlock:
return ret;
}
+static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ ssize_t ret;
+ struct fd real;
+ const struct cred *old_cred;
+
+ ret = ovl_real_fdget(in, &real);
+ if (ret)
+ return ret;
+
+ old_cred = ovl_override_creds(file_inode(in)->i_sb);
+ ret = generic_file_splice_read(real.file, ppos, pipe, len, flags);
+ revert_creds(old_cred);
+
+ ovl_file_accessed(in);
+ fdput(real);
+ return ret;
+}
+
+static ssize_t
+ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ ret = ovl_real_fdget(out, &real);
+ if (ret)
+ return ret;
+
+ old_cred = ovl_override_creds(file_inode(out)->i_sb);
+ ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
+ revert_creds(old_cred);
+
+ ovl_file_accessed(out);
+ fdput(real);
+ return ret;
+}
+
static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct fd real;
@@ -647,7 +763,25 @@ const struct file_operations ovl_file_operations = {
.fadvise = ovl_fadvise,
.unlocked_ioctl = ovl_ioctl,
.compat_ioctl = ovl_compat_ioctl,
+ .splice_read = ovl_splice_read,
+ .splice_write = ovl_splice_write,
.copy_file_range = ovl_copy_file_range,
.remap_file_range = ovl_remap_file_range,
};
+
+int __init ovl_aio_request_cache_init(void)
+{
+ ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req",
+ sizeof(struct ovl_aio_req),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!ovl_aio_request_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ovl_aio_request_cache_destroy(void)
+{
+ kmem_cache_destroy(ovl_aio_request_cachep);
+}
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index b045cf1826fc..79e8994e3bc1 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -75,10 +75,9 @@ out:
return err;
}
-static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
- struct ovl_layer *lower_layer)
+static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
- bool samefs = ovl_same_sb(dentry->d_sb);
+ bool samefs = ovl_same_fs(dentry->d_sb);
unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
if (samefs) {
@@ -100,12 +99,10 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
* persistent for a given layer configuration.
*/
if (stat->ino >> shift) {
- pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
+ pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
dentry, stat->ino, xinobits);
} else {
- if (lower_layer)
- stat->ino |= ((u64)lower_layer->fsid) << shift;
-
+ stat->ino |= ((u64)fsid) << shift;
stat->dev = dentry->d_sb->s_dev;
return 0;
}
@@ -124,15 +121,14 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
*/
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
- } else if (lower_layer && lower_layer->fsid) {
+ } else {
/*
* For non-samefs setup, if we cannot map all layers st_ino
* to a unified address space, we need to make sure that st_dev
- * is unique per lower fs. Upper layer uses real st_dev and
- * lower layers use the unique anonymous bdev assigned to the
- * lower fs.
+ * is unique per underlying fs, so we use the unique anonymous
+ * bdev assigned to the underlying fs.
*/
- stat->dev = lower_layer->fs->pseudo_dev;
+ stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev;
}
return 0;
@@ -146,8 +142,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
struct path realpath;
const struct cred *old_cred;
bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
- bool samefs = ovl_same_sb(dentry->d_sb);
- struct ovl_layer *lower_layer = NULL;
+ int fsid = 0;
int err;
bool metacopy_blocks = false;
@@ -168,9 +163,9 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
* If lower filesystem supports NFS file handles, this also guaranties
* persistent st_ino across mount cycle.
*/
- if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) {
+ if (!is_dir || ovl_same_dev(dentry->d_sb)) {
if (!OVL_TYPE_UPPER(type)) {
- lower_layer = ovl_layer_lower(dentry);
+ fsid = ovl_layer_lower(dentry)->fsid;
} else if (OVL_TYPE_ORIGIN(type)) {
struct kstat lowerstat;
u32 lowermask = STATX_INO | STATX_BLOCKS |
@@ -200,14 +195,8 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
(!ovl_verify_lower(dentry->d_sb) &&
(is_dir || lowerstat.nlink == 1))) {
- lower_layer = ovl_layer_lower(dentry);
- /*
- * Cannot use origin st_dev;st_ino because
- * origin inode content may differ from overlay
- * inode content.
- */
- if (samefs || lower_layer->fsid)
- stat->ino = lowerstat.ino;
+ fsid = ovl_layer_lower(dentry)->fsid;
+ stat->ino = lowerstat.ino;
}
/*
@@ -241,7 +230,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
}
}
- err = ovl_map_dev_ino(dentry, stat, lower_layer);
+ err = ovl_map_dev_ino(dentry, stat, fsid);
if (err)
goto out;
@@ -527,6 +516,27 @@ static const struct address_space_operations ovl_aops = {
* [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2)
* [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
* [...] &type->i_mutex_dir_key (stack_depth=0)
+ *
+ * Locking order w.r.t ovl_want_write() is important for nested overlayfs.
+ *
+ * This chain is valid:
+ * - inode->i_rwsem (inode_lock[2])
+ * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0])
+ * - OVL_I(inode)->lock (ovl_inode_lock[2])
+ * - OVL_I(lowerinode)->lock (ovl_inode_lock[1])
+ *
+ * And this chain is valid:
+ * - inode->i_rwsem (inode_lock[2])
+ * - OVL_I(inode)->lock (ovl_inode_lock[2])
+ * - lowerinode->i_rwsem (inode_lock[1])
+ * - OVL_I(lowerinode)->lock (ovl_inode_lock[1])
+ *
+ * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is
+ * held, because it is in reverse order of the non-nested case using the same
+ * upper fs:
+ * - inode->i_rwsem (inode_lock[1])
+ * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0])
+ * - OVL_I(inode)->lock (ovl_inode_lock[1])
*/
#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
@@ -565,7 +575,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
* ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
* upper inode i_ino on ovl_inode_init() or ovl_inode_update().
*/
- if (ovl_same_sb(inode->i_sb) || xinobits) {
+ if (ovl_same_dev(inode->i_sb)) {
inode->i_ino = ino;
if (xinobits && fsid && !(ino >> (64 - xinobits)))
inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
@@ -698,7 +708,7 @@ unsigned int ovl_get_nlink(struct dentry *lowerdentry,
return nlink;
fail:
- pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n",
+ pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n",
upperdentry, err);
return fallback;
}
@@ -969,7 +979,7 @@ out:
return inode;
out_err:
- pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err);
+ pr_warn_ratelimited("failed to get inode (%i)\n", err);
inode = ERR_PTR(err);
goto out;
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 76ff66339173..ed9e129fae04 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -141,10 +141,10 @@ out:
return NULL;
fail:
- pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res);
+ pr_warn_ratelimited("failed to get origin (%i)\n", res);
goto out;
invalid:
- pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh);
+ pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh);
goto out;
}
@@ -322,16 +322,16 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
struct dentry *origin = NULL;
int i;
- for (i = 0; i < ofs->numlower; i++) {
+ for (i = 1; i < ofs->numlayer; i++) {
/*
* If lower fs uuid is not unique among lower fs we cannot match
* fh->uuid to layer.
*/
- if (ofs->lower_layers[i].fsid &&
- ofs->lower_layers[i].fs->bad_uuid)
+ if (ofs->layers[i].fsid &&
+ ofs->layers[i].fs->bad_uuid)
continue;
- origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt,
+ origin = ovl_decode_real_fh(fh, ofs->layers[i].mnt,
connected);
if (origin)
break;
@@ -354,13 +354,13 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
}
**stackp = (struct ovl_path){
.dentry = origin,
- .layer = &ofs->lower_layers[i]
+ .layer = &ofs->layers[i]
};
return 0;
invalid:
- pr_warn_ratelimited("overlayfs: invalid origin (%pd2, ftype=%x, origin ftype=%x).\n",
+ pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n",
upperdentry, d_inode(upperdentry)->i_mode & S_IFMT,
d_inode(origin)->i_mode & S_IFMT);
dput(origin);
@@ -449,7 +449,7 @@ out:
fail:
inode = d_inode(real);
- pr_warn_ratelimited("overlayfs: failed to verify %s (%pd2, ino=%lu, err=%i)\n",
+ pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n",
is_upper ? "upper" : "origin", real,
inode ? inode->i_ino : 0, err);
goto out;
@@ -475,7 +475,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
return upper ?: ERR_PTR(-ESTALE);
if (!d_is_dir(upper)) {
- pr_warn_ratelimited("overlayfs: invalid index upper (%pd2, upper=%pd2).\n",
+ pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n",
index, upper);
dput(upper);
return ERR_PTR(-EIO);
@@ -589,12 +589,12 @@ out:
return err;
fail:
- pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, ftype=%x, err=%i)\n",
+ pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n",
index, d_inode(index)->i_mode & S_IFMT, err);
goto out;
orphan:
- pr_warn_ratelimited("overlayfs: orphan index entry (%pd2, ftype=%x, nlink=%u)\n",
+ pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n",
index, d_inode(index)->i_mode & S_IFMT,
d_inode(index)->i_nlink);
err = -ENOENT;
@@ -696,7 +696,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
index = NULL;
goto out;
}
- pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n"
+ pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n"
"overlayfs: mount with '-o index=off' to disable inodes index.\n",
d_inode(origin)->i_ino, name.len, name.name,
err);
@@ -723,13 +723,13 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
* unlinked, which means that finding a lower origin on lookup
* whose index is a whiteout should be treated as an error.
*/
- pr_warn_ratelimited("overlayfs: bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n",
+ pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n",
index, d_inode(index)->i_mode & S_IFMT,
d_inode(origin)->i_mode & S_IFMT);
goto fail;
} else if (is_dir && verify) {
if (!upper) {
- pr_warn_ratelimited("overlayfs: suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n",
+ pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n",
origin, index);
goto fail;
}
@@ -738,7 +738,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
err = ovl_verify_upper(index, upper, false);
if (err) {
if (err == -ESTALE) {
- pr_warn_ratelimited("overlayfs: suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n",
+ pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n",
upper, origin, index);
}
goto fail;
@@ -885,7 +885,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (!d.stop && poe->numlower) {
err = -ENOMEM;
- stack = kcalloc(ofs->numlower, sizeof(struct ovl_path),
+ stack = kcalloc(ofs->numlayer - 1, sizeof(struct ovl_path),
GFP_KERNEL);
if (!stack)
goto out_put_upper;
@@ -967,7 +967,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
*/
err = -EPERM;
if (d.redirect && !ofs->config.redirect_follow) {
- pr_warn_ratelimited("overlayfs: refusing to follow redirect for (%pd2)\n",
+ pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n",
dentry);
goto out_put;
}
@@ -994,7 +994,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
err = -EPERM;
if (!ofs->config.metacopy) {
- pr_warn_ratelimited("overlay: refusing to follow metacopy origin for (%pd2)\n",
+ pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n",
dentry);
goto out_put;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index f283b1d69a9e..3623d28aa4fa 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -9,6 +9,9 @@
#include <linux/fs.h>
#include "ovl_entry.h"
+#undef pr_fmt
+#define pr_fmt(fmt) "overlayfs: " fmt
+
enum ovl_path_type {
__OVL_PATH_UPPER = (1 << 0),
__OVL_PATH_MERGE = (1 << 1),
@@ -221,7 +224,6 @@ int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
-struct super_block *ovl_same_sb(struct super_block *sb);
int ovl_can_decode_fh(struct super_block *sb);
struct dentry *ovl_indexdir(struct super_block *sb);
bool ovl_index_all(struct super_block *sb);
@@ -237,7 +239,7 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_lowerdata(struct dentry *dentry);
-struct ovl_layer *ovl_layer_lower(struct dentry *dentry);
+const struct ovl_layer *ovl_layer_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_i_dentry_upper(struct inode *inode);
struct inode *ovl_inode_upper(struct inode *inode);
@@ -299,11 +301,21 @@ static inline bool ovl_is_impuredir(struct dentry *dentry)
return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
}
-static inline unsigned int ovl_xino_bits(struct super_block *sb)
+/* All layers on same fs? */
+static inline bool ovl_same_fs(struct super_block *sb)
+{
+ return OVL_FS(sb)->xino_mode == 0;
+}
+
+/* All overlay inodes have same st_dev? */
+static inline bool ovl_same_dev(struct super_block *sb)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ return OVL_FS(sb)->xino_mode >= 0;
+}
- return ofs->xino_bits;
+static inline unsigned int ovl_xino_bits(struct super_block *sb)
+{
+ return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0;
}
static inline int ovl_inode_lock(struct inode *inode)
@@ -438,6 +450,8 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
/* file.c */
extern const struct file_operations ovl_file_operations;
+int __init ovl_aio_request_cache_init(void);
+void ovl_aio_request_cache_destroy(void);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 28348c44ea5b..89015ea822e7 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -24,6 +24,8 @@ struct ovl_sb {
dev_t pseudo_dev;
/* Unusable (conflicting) uuid */
bool bad_uuid;
+ /* Used as a lower layer (but maybe also as upper) */
+ bool is_lower;
};
struct ovl_layer {
@@ -38,18 +40,18 @@ struct ovl_layer {
};
struct ovl_path {
- struct ovl_layer *layer;
+ const struct ovl_layer *layer;
struct dentry *dentry;
};
/* private information held for overlayfs's superblock */
struct ovl_fs {
struct vfsmount *upper_mnt;
- unsigned int numlower;
- /* Number of unique lower sb that differ from upper sb */
- unsigned int numlowerfs;
- struct ovl_layer *lower_layers;
- struct ovl_sb *lower_fs;
+ unsigned int numlayer;
+ /* Number of unique fs among layers including upper fs */
+ unsigned int numfs;
+ const struct ovl_layer *layers;
+ struct ovl_sb *fs;
/* workbasedir is the path at workdir= mount option */
struct dentry *workbasedir;
/* workdir is the 'work' directory under workbasedir */
@@ -71,10 +73,15 @@ struct ovl_fs {
struct inode *workbasedir_trap;
struct inode *workdir_trap;
struct inode *indexdir_trap;
- /* Inode numbers in all layers do not use the high xino_bits */
- unsigned int xino_bits;
+ /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */
+ int xino_mode;
};
+static inline struct ovl_fs *OVL_FS(struct super_block *sb)
+{
+ return (struct ovl_fs *)sb->s_fs_info;
+}
+
/* private information held for every overlayfs dentry */
struct ovl_entry {
union {
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 47a91c9733a5..40ac9ce2465a 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -441,7 +441,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
const char *name, int namelen)
{
if (ino >> (64 - xinobits)) {
- pr_warn_ratelimited("overlayfs: d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
+ pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
namelen, name, ino, xinobits);
return ino;
}
@@ -469,7 +469,7 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
int xinobits = ovl_xino_bits(dir->d_sb);
int err = 0;
- if (!ovl_same_sb(dir->d_sb) && !xinobits)
+ if (!ovl_same_dev(dir->d_sb))
goto out;
if (p->name[0] == '.') {
@@ -504,7 +504,13 @@ get:
if (err)
goto fail;
- WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
+ /*
+ * Directory inode is always on overlay st_dev.
+ * Non-dir with ovl_same_dev() could be on pseudo st_dev in case
+ * of xino bits overflow.
+ */
+ WARN_ON_ONCE(S_ISDIR(stat.mode) &&
+ dir->d_sb->s_dev != stat.dev);
ino = stat.ino;
} else if (xinobits && !OVL_TYPE_UPPER(type)) {
ino = ovl_remap_lower_ino(ino, xinobits,
@@ -518,7 +524,7 @@ out:
return err;
fail:
- pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n",
+ pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n",
p->name, err);
goto out;
}
@@ -685,7 +691,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
int err;
struct ovl_dir_file *od = file->private_data;
struct dentry *dir = file->f_path.dentry;
- struct ovl_layer *lower_layer = ovl_layer_lower(dir);
+ const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
struct ovl_readdir_translate rdt = {
.ctx.actor = ovl_fill_real,
.orig_ctx = ctx,
@@ -738,7 +744,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
* entries.
*/
if (ovl_xino_bits(dentry->d_sb) ||
- (ovl_same_sb(dentry->d_sb) &&
+ (ovl_same_fs(dentry->d_sb) &&
(ovl_is_impure_dir(file) ||
OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
return ovl_iterate_real(file, ctx);
@@ -965,7 +971,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
- pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
+ pr_err("lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
@@ -1147,6 +1153,6 @@ next:
out:
ovl_cache_free(&list);
if (err)
- pr_err("overlayfs: failed index dir cleanup (%i)\n", err);
+ pr_err("failed index dir cleanup (%i)\n", err);
return err;
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7621ff176d15..319fe0d355b0 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -224,14 +224,14 @@ static void ovl_free_fs(struct ovl_fs *ofs)
if (ofs->upperdir_locked)
ovl_inuse_unlock(ofs->upper_mnt->mnt_root);
mntput(ofs->upper_mnt);
- for (i = 0; i < ofs->numlower; i++) {
- iput(ofs->lower_layers[i].trap);
- mntput(ofs->lower_layers[i].mnt);
+ for (i = 1; i < ofs->numlayer; i++) {
+ iput(ofs->layers[i].trap);
+ mntput(ofs->layers[i].mnt);
}
- for (i = 0; i < ofs->numlowerfs; i++)
- free_anon_bdev(ofs->lower_fs[i].pseudo_dev);
- kfree(ofs->lower_layers);
- kfree(ofs->lower_fs);
+ kfree(ofs->layers);
+ for (i = 0; i < ofs->numfs; i++)
+ free_anon_bdev(ofs->fs[i].pseudo_dev);
+ kfree(ofs->fs);
kfree(ofs->config.lowerdir);
kfree(ofs->config.upperdir);
@@ -358,7 +358,7 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
if (ofs->config.nfs_export != ovl_nfs_export_def)
seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ?
"on" : "off");
- if (ofs->config.xino != ovl_xino_def())
+ if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(sb))
seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]);
if (ofs->config.metacopy != ovl_metacopy_def)
seq_printf(m, ",metacopy=%s",
@@ -462,7 +462,7 @@ static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode)
if (ovl_redirect_always_follow)
config->redirect_follow = true;
} else if (strcmp(mode, "nofollow") != 0) {
- pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n",
+ pr_err("bad mount option \"redirect_dir=%s\"\n",
mode);
return -EINVAL;
}
@@ -560,14 +560,15 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
break;
default:
- pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
+ pr_err("unrecognized mount option \"%s\" or missing value\n",
+ p);
return -EINVAL;
}
}
/* Workdir is useless in non-upper mount */
if (!config->upperdir && config->workdir) {
- pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+ pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
config->workdir);
kfree(config->workdir);
config->workdir = NULL;
@@ -587,7 +588,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
/* Resolve metacopy -> redirect_dir dependency */
if (config->metacopy && !config->redirect_dir) {
if (metacopy_opt && redirect_opt) {
- pr_err("overlayfs: conflicting options: metacopy=on,redirect_dir=%s\n",
+ pr_err("conflicting options: metacopy=on,redirect_dir=%s\n",
config->redirect_mode);
return -EINVAL;
}
@@ -596,7 +597,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
* There was an explicit redirect_dir=... that resulted
* in this conflict.
*/
- pr_info("overlayfs: disabling metacopy due to redirect_dir=%s\n",
+ pr_info("disabling metacopy due to redirect_dir=%s\n",
config->redirect_mode);
config->metacopy = false;
} else {
@@ -692,7 +693,7 @@ out_unlock:
out_dput:
dput(work);
out_err:
- pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n",
+ pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n",
ofs->config.workdir, name, -err);
work = NULL;
goto out_unlock;
@@ -716,21 +717,21 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
int err = -EINVAL;
if (!*name) {
- pr_err("overlayfs: empty lowerdir\n");
+ pr_err("empty lowerdir\n");
goto out;
}
err = kern_path(name, LOOKUP_FOLLOW, path);
if (err) {
- pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
+ pr_err("failed to resolve '%s': %i\n", name, err);
goto out;
}
err = -EINVAL;
if (ovl_dentry_weird(path->dentry)) {
- pr_err("overlayfs: filesystem on '%s' not supported\n", name);
+ pr_err("filesystem on '%s' not supported\n", name);
goto out_put;
}
if (!d_is_dir(path->dentry)) {
- pr_err("overlayfs: '%s' not a directory\n", name);
+ pr_err("'%s' not a directory\n", name);
goto out_put;
}
return 0;
@@ -752,7 +753,7 @@ static int ovl_mount_dir(const char *name, struct path *path)
if (!err)
if (ovl_dentry_remote(path->dentry)) {
- pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n",
+ pr_err("filesystem on '%s' not supported as upperdir\n",
tmp);
path_put_init(path);
err = -EINVAL;
@@ -769,7 +770,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
int err = vfs_statfs(path, &statfs);
if (err)
- pr_err("overlayfs: statfs failed on '%s'\n", name);
+ pr_err("statfs failed on '%s'\n", name);
else
ofs->namelen = max(ofs->namelen, statfs.f_namelen);
@@ -804,13 +805,13 @@ static int ovl_lower_dir(const char *name, struct path *path,
(ofs->config.index && ofs->config.upperdir)) && !fh_type) {
ofs->config.index = false;
ofs->config.nfs_export = false;
- pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n",
+ pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n",
name);
}
/* Check if lower fs has 32bit inode numbers */
if (fh_type != FILEID_INO32_GEN)
- ofs->xino_bits = 0;
+ ofs->xino_mode = -1;
return 0;
@@ -996,7 +997,7 @@ static int ovl_setup_trap(struct super_block *sb, struct dentry *dir,
err = PTR_ERR_OR_ZERO(trap);
if (err) {
if (err == -ELOOP)
- pr_err("overlayfs: conflicting %s path\n", name);
+ pr_err("conflicting %s path\n", name);
return err;
}
@@ -1013,11 +1014,11 @@ static int ovl_setup_trap(struct super_block *sb, struct dentry *dir,
static int ovl_report_in_use(struct ovl_fs *ofs, const char *name)
{
if (ofs->config.index) {
- pr_err("overlayfs: %s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n",
+ pr_err("%s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n",
name);
return -EBUSY;
} else {
- pr_warn("overlayfs: %s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n",
+ pr_warn("%s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n",
name);
return 0;
}
@@ -1035,7 +1036,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
/* Upper fs should not be r/o */
if (sb_rdonly(upperpath->mnt->mnt_sb)) {
- pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
+ pr_err("upper fs is r/o, try multi-lower layers mount\n");
err = -EINVAL;
goto out;
}
@@ -1052,7 +1053,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
upper_mnt = clone_private_mount(upperpath);
err = PTR_ERR(upper_mnt);
if (IS_ERR(upper_mnt)) {
- pr_err("overlayfs: failed to clone upperpath\n");
+ pr_err("failed to clone upperpath\n");
goto out;
}
@@ -1108,7 +1109,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
* kernel upgrade. So warn instead of erroring out.
*/
if (!err)
- pr_warn("overlayfs: upper fs needs to support d_type.\n");
+ pr_warn("upper fs needs to support d_type.\n");
/* Check if upper/work fs supports O_TMPFILE */
temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0);
@@ -1116,7 +1117,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
if (ofs->tmpfile)
dput(temp);
else
- pr_warn("overlayfs: upper fs does not support tmpfile.\n");
+ pr_warn("upper fs does not support tmpfile.\n");
/*
* Check if upper/work fs supports trusted.overlay.* xattr
@@ -1126,7 +1127,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
ofs->noxattr = true;
ofs->config.index = false;
ofs->config.metacopy = false;
- pr_warn("overlayfs: upper fs does not support xattr, falling back to index=off and metacopy=off.\n");
+ pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n");
err = 0;
} else {
vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE);
@@ -1136,16 +1137,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
if (ofs->config.index && !fh_type) {
ofs->config.index = false;
- pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n");
+ pr_warn("upper fs does not support file handles, falling back to index=off.\n");
}
/* Check if upper fs has 32bit inode numbers */
if (fh_type != FILEID_INO32_GEN)
- ofs->xino_bits = 0;
+ ofs->xino_mode = -1;
/* NFS export of r/w mount depends on index */
if (ofs->config.nfs_export && !ofs->config.index) {
- pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n");
+ pr_warn("NFS export requires \"index=on\", falling back to nfs_export=off.\n");
ofs->config.nfs_export = false;
}
out:
@@ -1165,11 +1166,11 @@ static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs,
err = -EINVAL;
if (upperpath->mnt != workpath.mnt) {
- pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+ pr_err("workdir and upperdir must reside under the same mount\n");
goto out;
}
if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) {
- pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+ pr_err("workdir and upperdir must be separate subtrees\n");
goto out;
}
@@ -1210,7 +1211,7 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry,
true);
if (err) {
- pr_err("overlayfs: failed to verify upper root origin\n");
+ pr_err("failed to verify upper root origin\n");
goto out;
}
@@ -1233,18 +1234,18 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
err = ovl_verify_set_fh(ofs->indexdir, OVL_XATTR_ORIGIN,
upperpath->dentry, true, false);
if (err)
- pr_err("overlayfs: failed to verify index dir 'origin' xattr\n");
+ pr_err("failed to verify index dir 'origin' xattr\n");
}
err = ovl_verify_upper(ofs->indexdir, upperpath->dentry, true);
if (err)
- pr_err("overlayfs: failed to verify index dir 'upper' xattr\n");
+ pr_err("failed to verify index dir 'upper' xattr\n");
/* Cleanup bad/stale/orphan index entries */
if (!err)
err = ovl_indexdir_cleanup(ofs);
}
if (err || !ofs->indexdir)
- pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
+ pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
out:
mnt_drop_write(mnt);
@@ -1258,7 +1259,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
if (!ofs->config.nfs_export && !ofs->upper_mnt)
return true;
- for (i = 0; i < ofs->numlowerfs; i++) {
+ for (i = 0; i < ofs->numfs; i++) {
/*
* We use uuid to associate an overlay lower file handle with a
* lower layer, so we can accept lower fs with null uuid as long
@@ -1266,8 +1267,9 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
* if we detect multiple lower fs with the same uuid, we
* disable lower file handle decoding on all of them.
*/
- if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) {
- ofs->lower_fs[i].bad_uuid = true;
+ if (ofs->fs[i].is_lower &&
+ uuid_equal(&ofs->fs[i].sb->s_uuid, uuid)) {
+ ofs->fs[i].bad_uuid = true;
return false;
}
}
@@ -1283,13 +1285,9 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
int err;
bool bad_uuid = false;
- /* fsid 0 is reserved for upper fs even with non upper overlay */
- if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb)
- return 0;
-
- for (i = 0; i < ofs->numlowerfs; i++) {
- if (ofs->lower_fs[i].sb == sb)
- return i + 1;
+ for (i = 0; i < ofs->numfs; i++) {
+ if (ofs->fs[i].sb == sb)
+ return i;
}
if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) {
@@ -1297,7 +1295,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
if (ofs->config.index || ofs->config.nfs_export) {
ofs->config.index = false;
ofs->config.nfs_export = false;
- pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
+ pr_warn("%s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
uuid_is_null(&sb->s_uuid) ? "null" :
"conflicting",
path->dentry);
@@ -1306,35 +1304,59 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
err = get_anon_bdev(&dev);
if (err) {
- pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n");
+ pr_err("failed to get anonymous bdev for lowerpath\n");
return err;
}
- ofs->lower_fs[ofs->numlowerfs].sb = sb;
- ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev;
- ofs->lower_fs[ofs->numlowerfs].bad_uuid = bad_uuid;
- ofs->numlowerfs++;
+ ofs->fs[ofs->numfs].sb = sb;
+ ofs->fs[ofs->numfs].pseudo_dev = dev;
+ ofs->fs[ofs->numfs].bad_uuid = bad_uuid;
- return ofs->numlowerfs;
+ return ofs->numfs++;
}
-static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs,
- struct path *stack, unsigned int numlower)
+static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
+ struct path *stack, unsigned int numlower)
{
int err;
unsigned int i;
+ struct ovl_layer *layers;
err = -ENOMEM;
- ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer),
- GFP_KERNEL);
- if (ofs->lower_layers == NULL)
+ layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL);
+ if (!layers)
goto out;
+ ofs->layers = layers;
- ofs->lower_fs = kcalloc(numlower, sizeof(struct ovl_sb),
- GFP_KERNEL);
- if (ofs->lower_fs == NULL)
+ ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL);
+ if (ofs->fs == NULL)
goto out;
+ /* idx/fsid 0 are reserved for upper fs even with lower only overlay */
+ ofs->numfs++;
+
+ layers[0].mnt = ofs->upper_mnt;
+ layers[0].idx = 0;
+ layers[0].fsid = 0;
+ ofs->numlayer = 1;
+
+ /*
+ * All lower layers that share the same fs as upper layer, use the same
+ * pseudo_dev as upper layer. Allocate fs[0].pseudo_dev even for lower
+ * only overlay to simplify ovl_fs_free().
+ * is_lower will be set if upper fs is shared with a lower layer.
+ */
+ err = get_anon_bdev(&ofs->fs[0].pseudo_dev);
+ if (err) {
+ pr_err("failed to get anonymous bdev for upper fs\n");
+ goto out;
+ }
+
+ if (ofs->upper_mnt) {
+ ofs->fs[0].sb = ofs->upper_mnt->mnt_sb;
+ ofs->fs[0].is_lower = false;
+ }
+
for (i = 0; i < numlower; i++) {
struct vfsmount *mnt;
struct inode *trap;
@@ -1357,7 +1379,7 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs,
mnt = clone_private_mount(&stack[i]);
err = PTR_ERR(mnt);
if (IS_ERR(mnt)) {
- pr_err("overlayfs: failed to clone lowerpath\n");
+ pr_err("failed to clone lowerpath\n");
iput(trap);
goto out;
}
@@ -1368,15 +1390,13 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs,
*/
mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
- ofs->lower_layers[ofs->numlower].trap = trap;
- ofs->lower_layers[ofs->numlower].mnt = mnt;
- ofs->lower_layers[ofs->numlower].idx = i + 1;
- ofs->lower_layers[ofs->numlower].fsid = fsid;
- if (fsid) {
- ofs->lower_layers[ofs->numlower].fs =
- &ofs->lower_fs[fsid - 1];
- }
- ofs->numlower++;
+ layers[ofs->numlayer].trap = trap;
+ layers[ofs->numlayer].mnt = mnt;
+ layers[ofs->numlayer].idx = ofs->numlayer;
+ layers[ofs->numlayer].fsid = fsid;
+ layers[ofs->numlayer].fs = &ofs->fs[fsid];
+ ofs->numlayer++;
+ ofs->fs[fsid].is_lower = true;
}
/*
@@ -1387,22 +1407,23 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs,
* bits reserved for fsid, it emits a warning and uses the original
* inode number.
*/
- if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) {
- ofs->xino_bits = 0;
- ofs->config.xino = OVL_XINO_OFF;
- } else if (ofs->config.xino == OVL_XINO_ON && !ofs->xino_bits) {
+ if (ofs->numfs - !ofs->upper_mnt == 1) {
+ if (ofs->config.xino == OVL_XINO_ON)
+ pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n");
+ ofs->xino_mode = 0;
+ } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) {
/*
- * This is a roundup of number of bits needed for numlowerfs+1
- * (i.e. ilog2(numlowerfs+1 - 1) + 1). fsid 0 is reserved for
- * upper fs even with non upper overlay.
+ * This is a roundup of number of bits needed for encoding
+ * fsid, where fsid 0 is reserved for upper fs even with
+ * lower only overlay.
*/
BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31);
- ofs->xino_bits = ilog2(ofs->numlowerfs) + 1;
+ ofs->xino_mode = ilog2(ofs->numfs - 1) + 1;
}
- if (ofs->xino_bits) {
- pr_info("overlayfs: \"xino\" feature enabled using %d upper inode bits.\n",
- ofs->xino_bits);
+ if (ofs->xino_mode > 0) {
+ pr_info("\"xino\" feature enabled using %d upper inode bits.\n",
+ ofs->xino_mode);
}
err = 0;
@@ -1428,15 +1449,15 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
err = -EINVAL;
stacklen = ovl_split_lowerdirs(lowertmp);
if (stacklen > OVL_MAX_STACK) {
- pr_err("overlayfs: too many lower directories, limit is %d\n",
+ pr_err("too many lower directories, limit is %d\n",
OVL_MAX_STACK);
goto out_err;
} else if (!ofs->config.upperdir && stacklen == 1) {
- pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
+ pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n");
goto out_err;
} else if (!ofs->config.upperdir && ofs->config.nfs_export &&
ofs->config.redirect_follow) {
- pr_warn("overlayfs: NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n");
+ pr_warn("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n");
ofs->config.nfs_export = false;
}
@@ -1459,11 +1480,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
err = -EINVAL;
sb->s_stack_depth++;
if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
- pr_err("overlayfs: maximum fs stacking depth exceeded\n");
+ pr_err("maximum fs stacking depth exceeded\n");
goto out_err;
}
- err = ovl_get_lower_layers(sb, ofs, stack, numlower);
+ err = ovl_get_layers(sb, ofs, stack, numlower);
if (err)
goto out_err;
@@ -1474,7 +1495,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
for (i = 0; i < numlower; i++) {
oe->lowerstack[i].dentry = dget(stack[i].dentry);
- oe->lowerstack[i].layer = &ofs->lower_layers[i];
+ oe->lowerstack[i].layer = &ofs->layers[i+1];
}
if (remote)
@@ -1515,7 +1536,7 @@ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs,
while (!err && parent != next) {
if (ovl_lookup_trap_inode(sb, parent)) {
err = -ELOOP;
- pr_err("overlayfs: overlapping %s path\n", name);
+ pr_err("overlapping %s path\n", name);
} else if (ovl_is_inuse(parent)) {
err = ovl_report_in_use(ofs, name);
}
@@ -1555,9 +1576,9 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
return err;
}
- for (i = 0; i < ofs->numlower; i++) {
+ for (i = 1; i < ofs->numlayer; i++) {
err = ovl_check_layer(sb, ofs,
- ofs->lower_layers[i].mnt->mnt_root,
+ ofs->layers[i].mnt->mnt_root,
"lowerdir");
if (err)
return err;
@@ -1595,7 +1616,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
err = -EINVAL;
if (!ofs->config.lowerdir) {
if (!silent)
- pr_err("overlayfs: missing 'lowerdir'\n");
+ pr_err("missing 'lowerdir'\n");
goto out_err;
}
@@ -1603,14 +1624,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_maxbytes = MAX_LFS_FILESIZE;
/* Assume underlaying fs uses 32bit inodes unless proven otherwise */
if (ofs->config.xino != OVL_XINO_OFF)
- ofs->xino_bits = BITS_PER_LONG - 32;
+ ofs->xino_mode = BITS_PER_LONG - 32;
/* alloc/destroy_inode needed for setting up traps in inode cache */
sb->s_op = &ovl_super_operations;
if (ofs->config.upperdir) {
if (!ofs->config.workdir) {
- pr_err("overlayfs: missing 'workdir'\n");
+ pr_err("missing 'workdir'\n");
goto out_err;
}
@@ -1660,13 +1681,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!ofs->indexdir) {
ofs->config.index = false;
if (ofs->upper_mnt && ofs->config.nfs_export) {
- pr_warn("overlayfs: NFS export requires an index dir, falling back to nfs_export=off.\n");
+ pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n");
ofs->config.nfs_export = false;
}
}
if (ofs->config.metacopy && ofs->config.nfs_export) {
- pr_warn("overlayfs: NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n");
+ pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n");
ofs->config.nfs_export = false;
}
@@ -1749,9 +1770,15 @@ static int __init ovl_init(void)
if (ovl_inode_cachep == NULL)
return -ENOMEM;
- err = register_filesystem(&ovl_fs_type);
- if (err)
- kmem_cache_destroy(ovl_inode_cachep);
+ err = ovl_aio_request_cache_init();
+ if (!err) {
+ err = register_filesystem(&ovl_fs_type);
+ if (!err)
+ return 0;
+
+ ovl_aio_request_cache_destroy();
+ }
+ kmem_cache_destroy(ovl_inode_cachep);
return err;
}
@@ -1766,7 +1793,7 @@ static void __exit ovl_exit(void)
*/
rcu_barrier();
kmem_cache_destroy(ovl_inode_cachep);
-
+ ovl_aio_request_cache_destroy();
}
module_init(ovl_init);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index f5678a3f8350..ea005085803f 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -40,18 +40,6 @@ const struct cred *ovl_override_creds(struct super_block *sb)
return override_creds(ofs->creator_cred);
}
-struct super_block *ovl_same_sb(struct super_block *sb)
-{
- struct ovl_fs *ofs = sb->s_fs_info;
-
- if (!ofs->numlowerfs)
- return ofs->upper_mnt->mnt_sb;
- else if (ofs->numlowerfs == 1 && !ofs->upper_mnt)
- return ofs->lower_fs[0].sb;
- else
- return NULL;
-}
-
/*
* Check if underlying fs supports file handles and try to determine encoding
* type, in order to deduce maximum inode number used by fs.
@@ -198,7 +186,7 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry)
return oe->numlower ? oe->lowerstack[0].dentry : NULL;
}
-struct ovl_layer *ovl_layer_lower(struct dentry *dentry)
+const struct ovl_layer *ovl_layer_lower(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -576,7 +564,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
err = ovl_do_setxattr(upperdentry, name, value, size, 0);
if (err == -EOPNOTSUPP) {
- pr_warn("overlayfs: cannot set %s xattr on upper\n", name);
+ pr_warn("cannot set %s xattr on upper\n", name);
ofs->noxattr = true;
return xerr;
}
@@ -700,7 +688,7 @@ static void ovl_cleanup_index(struct dentry *dentry)
inode = d_inode(upperdentry);
if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) {
- pr_warn_ratelimited("overlayfs: cleanup linked index (%pd2, ino=%lu, nlink=%u)\n",
+ pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n",
upperdentry, inode->i_ino, inode->i_nlink);
/*
* We either have a bug with persistent union nlink or a lower
@@ -739,7 +727,7 @@ out:
return;
fail:
- pr_err("overlayfs: cleanup index of '%pd2' failed (%i)\n", dentry, err);
+ pr_err("cleanup index of '%pd2' failed (%i)\n", dentry, err);
goto out;
}
@@ -830,7 +818,7 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
err_unlock:
unlock_rename(workdir, upperdir);
err:
- pr_err("overlayfs: failed to lock workdir+upperdir\n");
+ pr_err("failed to lock workdir+upperdir\n");
return -EIO;
}
@@ -852,7 +840,7 @@ int ovl_check_metacopy_xattr(struct dentry *dentry)
return 1;
out:
- pr_warn_ratelimited("overlayfs: failed to get metacopy (%i)\n", res);
+ pr_warn_ratelimited("failed to get metacopy (%i)\n", res);
return res;
}
@@ -899,7 +887,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
return res;
fail:
- pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n",
+ pr_warn_ratelimited("failed to get xattr %s: err=%zi)\n",
name, res);
kfree(buf);
return res;
@@ -931,7 +919,7 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
return buf;
invalid:
- pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf);
+ pr_warn_ratelimited("invalid redirect (%s)\n", buf);
res = -EINVAL;
kfree(buf);
return ERR_PTR(res);
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ead487e80510..bd08616ed8ba 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -33,3 +33,4 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
new file mode 100644
index 000000000000..9955d75c0585
--- /dev/null
+++ b/fs/proc/bootconfig.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /proc/bootconfig - Extra boot configuration
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/bootconfig.h>
+#include <linux/slab.h>
+
+static char *saved_boot_config;
+
+static int boot_config_proc_show(struct seq_file *m, void *v)
+{
+ if (saved_boot_config)
+ seq_puts(m, saved_boot_config);
+ return 0;
+}
+
+/* Rest size of buffer */
+#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0)
+
+/* Return the needed total length if @size is 0 */
+static int __init copy_xbc_key_value_list(char *dst, size_t size)
+{
+ struct xbc_node *leaf, *vnode;
+ const char *val;
+ char *key, *end = dst + size;
+ int ret = 0;
+
+ key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
+
+ xbc_for_each_key_value(leaf, val) {
+ ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX);
+ if (ret < 0)
+ break;
+ ret = snprintf(dst, rest(dst, end), "%s = ", key);
+ if (ret < 0)
+ break;
+ dst += ret;
+ vnode = xbc_node_get_child(leaf);
+ if (vnode && xbc_node_is_array(vnode)) {
+ xbc_array_for_each_value(vnode, val) {
+ ret = snprintf(dst, rest(dst, end), "\"%s\"%s",
+ val, vnode->next ? ", " : "\n");
+ if (ret < 0)
+ goto out;
+ dst += ret;
+ }
+ } else {
+ ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val);
+ if (ret < 0)
+ break;
+ dst += ret;
+ }
+ }
+out:
+ kfree(key);
+
+ return ret < 0 ? ret : dst - (end - size);
+}
+
+static int __init proc_boot_config_init(void)
+{
+ int len;
+
+ len = copy_xbc_key_value_list(NULL, 0);
+ if (len < 0)
+ return len;
+
+ if (len > 0) {
+ saved_boot_config = kzalloc(len + 1, GFP_KERNEL);
+ if (!saved_boot_config)
+ return -ENOMEM;
+
+ len = copy_xbc_key_value_list(saved_boot_config, len + 1);
+ if (len < 0) {
+ kfree(saved_boot_config);
+ return len;
+ }
+ }
+
+ proc_create_single("bootconfig", 0, NULL, boot_config_proc_show);
+
+ return 0;
+}
+fs_initcall(proc_boot_config_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 96f1087e372c..c1dea9b8222e 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -16,16 +16,16 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
return seq_open(file, &cpuinfo_op);
}
-static const struct file_operations proc_cpuinfo_operations = {
- .open = cpuinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops cpuinfo_proc_ops = {
+ .proc_open = cpuinfo_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
static int __init proc_cpuinfo_init(void)
{
- proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+ proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops);
return 0;
}
fs_initcall(proc_cpuinfo_init);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 074e9585c699..3faed94e4b65 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -473,7 +473,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
if (ent) {
ent->data = data;
- ent->proc_fops = &proc_dir_operations;
+ ent->proc_dir_ops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
ent = proc_register(parent, ent);
}
@@ -503,7 +503,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
ent = __proc_create(&parent, name, mode, 2);
if (ent) {
ent->data = NULL;
- ent->proc_fops = NULL;
+ ent->proc_dir_ops = NULL;
ent->proc_iops = NULL;
ent = proc_register(parent, ent);
}
@@ -533,25 +533,23 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
- const struct file_operations *proc_fops, void *data)
+ const struct proc_ops *proc_ops, void *data)
{
struct proc_dir_entry *p;
- BUG_ON(proc_fops == NULL);
-
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
- p->proc_fops = proc_fops;
+ p->proc_ops = proc_ops;
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
struct proc_dir_entry *proc_create(const char *name, umode_t mode,
struct proc_dir_entry *parent,
- const struct file_operations *proc_fops)
+ const struct proc_ops *proc_ops)
{
- return proc_create_data(name, mode, parent, proc_fops, NULL);
+ return proc_create_data(name, mode, parent, proc_ops, NULL);
}
EXPORT_SYMBOL(proc_create);
@@ -573,11 +571,11 @@ static int proc_seq_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static const struct file_operations proc_seq_fops = {
- .open = proc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = proc_seq_release,
+static const struct proc_ops proc_seq_ops = {
+ .proc_open = proc_seq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = proc_seq_release,
};
struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
@@ -589,7 +587,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
- p->proc_fops = &proc_seq_fops;
+ p->proc_ops = &proc_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
return proc_register(parent, p);
@@ -603,11 +601,11 @@ static int proc_single_open(struct inode *inode, struct file *file)
return single_open(file, de->single_show, de->data);
}
-static const struct file_operations proc_single_fops = {
- .open = proc_single_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops proc_single_ops = {
+ .proc_open = proc_single_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
@@ -619,7 +617,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
- p->proc_fops = &proc_single_fops;
+ p->proc_ops = &proc_single_ops;
p->single_show = show;
return proc_register(parent, p);
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index dbe43a50caf2..6da18316d209 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -163,7 +163,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
file = pdeo->file;
- pde->proc_fops->release(file_inode(file), file);
+ pde->proc_ops->proc_release(file_inode(file), file);
spin_lock(&pde->pde_unload_lock);
/* After ->release. */
list_del(&pdeo->lh);
@@ -200,12 +200,12 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
struct proc_dir_entry *pde = PDE(file_inode(file));
loff_t rv = -EINVAL;
if (use_pde(pde)) {
- typeof_member(struct file_operations, llseek) llseek;
+ typeof_member(struct proc_ops, proc_lseek) lseek;
- llseek = pde->proc_fops->llseek;
- if (!llseek)
- llseek = default_llseek;
- rv = llseek(file, offset, whence);
+ lseek = pde->proc_ops->proc_lseek;
+ if (!lseek)
+ lseek = default_llseek;
+ rv = lseek(file, offset, whence);
unuse_pde(pde);
}
return rv;
@@ -216,9 +216,9 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
if (use_pde(pde)) {
- typeof_member(struct file_operations, read) read;
+ typeof_member(struct proc_ops, proc_read) read;
- read = pde->proc_fops->read;
+ read = pde->proc_ops->proc_read;
if (read)
rv = read(file, buf, count, ppos);
unuse_pde(pde);
@@ -231,9 +231,9 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
if (use_pde(pde)) {
- typeof_member(struct file_operations, write) write;
+ typeof_member(struct proc_ops, proc_write) write;
- write = pde->proc_fops->write;
+ write = pde->proc_ops->proc_write;
if (write)
rv = write(file, buf, count, ppos);
unuse_pde(pde);
@@ -246,9 +246,9 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
struct proc_dir_entry *pde = PDE(file_inode(file));
__poll_t rv = DEFAULT_POLLMASK;
if (use_pde(pde)) {
- typeof_member(struct file_operations, poll) poll;
+ typeof_member(struct proc_ops, proc_poll) poll;
- poll = pde->proc_fops->poll;
+ poll = pde->proc_ops->proc_poll;
if (poll)
rv = poll(file, pts);
unuse_pde(pde);
@@ -261,9 +261,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
if (use_pde(pde)) {
- typeof_member(struct file_operations, unlocked_ioctl) ioctl;
+ typeof_member(struct proc_ops, proc_ioctl) ioctl;
- ioctl = pde->proc_fops->unlocked_ioctl;
+ ioctl = pde->proc_ops->proc_ioctl;
if (ioctl)
rv = ioctl(file, cmd, arg);
unuse_pde(pde);
@@ -277,9 +277,9 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
if (use_pde(pde)) {
- typeof_member(struct file_operations, compat_ioctl) compat_ioctl;
+ typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
- compat_ioctl = pde->proc_fops->compat_ioctl;
+ compat_ioctl = pde->proc_ops->proc_compat_ioctl;
if (compat_ioctl)
rv = compat_ioctl(file, cmd, arg);
unuse_pde(pde);
@@ -293,9 +293,9 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
struct proc_dir_entry *pde = PDE(file_inode(file));
int rv = -EIO;
if (use_pde(pde)) {
- typeof_member(struct file_operations, mmap) mmap;
+ typeof_member(struct proc_ops, proc_mmap) mmap;
- mmap = pde->proc_fops->mmap;
+ mmap = pde->proc_ops->proc_mmap;
if (mmap)
rv = mmap(file, vma);
unuse_pde(pde);
@@ -312,9 +312,9 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
unsigned long rv = -EIO;
if (use_pde(pde)) {
- typeof_member(struct file_operations, get_unmapped_area) get_area;
+ typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
- get_area = pde->proc_fops->get_unmapped_area;
+ get_area = pde->proc_ops->proc_get_unmapped_area;
#ifdef CONFIG_MMU
if (!get_area)
get_area = current->mm->get_unmapped_area;
@@ -333,8 +333,8 @@ static int proc_reg_open(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
int rv = 0;
- typeof_member(struct file_operations, open) open;
- typeof_member(struct file_operations, release) release;
+ typeof_member(struct proc_ops, proc_open) open;
+ typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
/*
@@ -351,7 +351,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
if (!use_pde(pde))
return -ENOENT;
- release = pde->proc_fops->release;
+ release = pde->proc_ops->proc_release;
if (release) {
pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
if (!pdeo) {
@@ -360,7 +360,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
}
}
- open = pde->proc_fops->open;
+ open = pde->proc_ops->proc_open;
if (open)
rv = open(inode, file);
@@ -468,21 +468,23 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_size = de->size;
if (de->nlink)
set_nlink(inode, de->nlink);
- WARN_ON(!de->proc_iops);
- inode->i_op = de->proc_iops;
- if (de->proc_fops) {
- if (S_ISREG(inode->i_mode)) {
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
- if (!de->proc_fops->compat_ioctl)
- inode->i_fop =
- &proc_reg_file_ops_no_compat;
- else
-#endif
- inode->i_fop = &proc_reg_file_ops;
- } else {
- inode->i_fop = de->proc_fops;
+ if (!de->proc_ops->proc_compat_ioctl) {
+ inode->i_fop = &proc_reg_file_ops_no_compat;
}
- }
+#endif
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = de->proc_dir_ops;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = NULL;
+ } else
+ BUG();
} else
pde_put(de);
return inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0f3b557c9b77..41587276798e 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -39,7 +39,10 @@ struct proc_dir_entry {
spinlock_t pde_unload_lock;
struct completion *pde_unload_completion;
const struct inode_operations *proc_iops;
- const struct file_operations *proc_fops;
+ union {
+ const struct proc_ops *proc_ops;
+ const struct file_operations *proc_dir_ops;
+ };
const struct dentry_operations *proc_dops;
union {
const struct seq_operations *seq_ops;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e2ed8e08cc7a..8ba492d44e68 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -574,11 +574,11 @@ static int release_kcore(struct inode *inode, struct file *file)
return 0;
}
-static const struct file_operations proc_kcore_operations = {
- .read = read_kcore,
- .open = open_kcore,
- .release = release_kcore,
- .llseek = default_llseek,
+static const struct proc_ops kcore_proc_ops = {
+ .proc_read = read_kcore,
+ .proc_open = open_kcore,
+ .proc_release = release_kcore,
+ .proc_lseek = default_llseek,
};
/* just remember that we have to update kcore */
@@ -637,8 +637,7 @@ static void __init add_modules_range(void)
static int __init proc_kcore_init(void)
{
- proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
- &proc_kcore_operations);
+ proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops);
if (!proc_root_kcore) {
pr_err("couldn't create /proc/kcore\n");
return 0; /* Always returns 0. */
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 4f4a2abb225e..ec1b7d2fb773 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -49,17 +49,17 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait)
}
-static const struct file_operations proc_kmsg_operations = {
- .read = kmsg_read,
- .poll = kmsg_poll,
- .open = kmsg_open,
- .release = kmsg_release,
- .llseek = generic_file_llseek,
+static const struct proc_ops kmsg_proc_ops = {
+ .proc_read = kmsg_read,
+ .proc_poll = kmsg_poll,
+ .proc_open = kmsg_open,
+ .proc_release = kmsg_release,
+ .proc_lseek = generic_file_llseek,
};
static int __init proc_kmsg_init(void)
{
- proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+ proc_create("kmsg", S_IRUSR, NULL, &kmsg_proc_ops);
return 0;
}
fs_initcall(proc_kmsg_init);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7c952ee732e6..f909243d4a66 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -21,6 +21,21 @@
#define KPMMASK (KPMSIZE - 1)
#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
+static inline unsigned long get_max_dump_pfn(void)
+{
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * The memmap of early sections is completely populated and marked
+ * online even if max_pfn does not fall on a section boundary -
+ * pfn_to_online_page() will succeed on all pages. Allow inspecting
+ * these memmaps.
+ */
+ return round_up(max_pfn, PAGES_PER_SECTION);
+#else
+ return max_pfn;
+#endif
+}
+
/* /proc/kpagecount - an array exposing page counts
*
* Each entry is a u64 representing the corresponding
@@ -29,6 +44,7 @@
static ssize_t kpagecount_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -37,9 +53,11 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
u64 pcount;
pfn = src / KPMSIZE;
- count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -71,9 +89,9 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
return ret;
}
-static const struct file_operations proc_kpagecount_operations = {
- .llseek = mem_lseek,
- .read = kpagecount_read,
+static const struct proc_ops kpagecount_proc_ops = {
+ .proc_lseek = mem_lseek,
+ .proc_read = kpagecount_read,
};
/* /proc/kpageflags - an array exposing page flags
@@ -206,6 +224,7 @@ u64 stable_page_flags(struct page *page)
static ssize_t kpageflags_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -213,9 +232,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
ssize_t ret = 0;
pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -242,15 +263,16 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
return ret;
}
-static const struct file_operations proc_kpageflags_operations = {
- .llseek = mem_lseek,
- .read = kpageflags_read,
+static const struct proc_ops kpageflags_proc_ops = {
+ .proc_lseek = mem_lseek,
+ .proc_read = kpageflags_read,
};
#ifdef CONFIG_MEMCG
static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -259,9 +281,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
u64 ino;
pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -293,18 +317,18 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
return ret;
}
-static const struct file_operations proc_kpagecgroup_operations = {
- .llseek = mem_lseek,
- .read = kpagecgroup_read,
+static const struct proc_ops kpagecgroup_proc_ops = {
+ .proc_lseek = mem_lseek,
+ .proc_read = kpagecgroup_read,
};
#endif /* CONFIG_MEMCG */
static int __init proc_page_init(void)
{
- proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
- proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+ proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops);
+ proc_create("kpageflags", S_IRUSR, NULL, &kpageflags_proc_ops);
#ifdef CONFIG_MEMCG
- proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+ proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops);
#endif
return 0;
}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 76ae278df1c4..4888c5224442 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -90,12 +90,12 @@ static int seq_release_net(struct inode *ino, struct file *f)
return 0;
}
-static const struct file_operations proc_net_seq_fops = {
- .open = seq_open_net,
- .read = seq_read,
- .write = proc_simple_write,
- .llseek = seq_lseek,
- .release = seq_release_net,
+static const struct proc_ops proc_net_seq_ops = {
+ .proc_open = seq_open_net,
+ .proc_read = seq_read,
+ .proc_write = proc_simple_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release_net,
};
struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
@@ -108,7 +108,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
if (!p)
return NULL;
pde_force_lookup(p);
- p->proc_fops = &proc_net_seq_fops;
+ p->proc_ops = &proc_net_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
return proc_register(parent, p);
@@ -152,7 +152,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode
if (!p)
return NULL;
pde_force_lookup(p);
- p->proc_fops = &proc_net_seq_fops;
+ p->proc_ops = &proc_net_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
p->write = write;
@@ -183,12 +183,12 @@ static int single_release_net(struct inode *ino, struct file *f)
return single_release(ino, f);
}
-static const struct file_operations proc_net_single_fops = {
- .open = single_open_net,
- .read = seq_read,
- .write = proc_simple_write,
- .llseek = seq_lseek,
- .release = single_release_net,
+static const struct proc_ops proc_net_single_ops = {
+ .proc_open = single_open_net,
+ .proc_read = seq_read,
+ .proc_write = proc_simple_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release_net,
};
struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
@@ -201,7 +201,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
if (!p)
return NULL;
pde_force_lookup(p);
- p->proc_fops = &proc_net_single_fops;
+ p->proc_ops = &proc_net_single_ops;
p->single_show = show;
return proc_register(parent, p);
}
@@ -244,7 +244,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
if (!p)
return NULL;
pde_force_lookup(p);
- p->proc_fops = &proc_net_single_fops;
+ p->proc_ops = &proc_net_single_ops;
p->single_show = show;
p->write = write;
return proc_register(parent, p);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d80989b6c344..c75bb4632ed1 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1720,7 +1720,7 @@ int __init proc_sys_init(void)
proc_sys_root = proc_mkdir("sys", NULL);
proc_sys_root->proc_iops = &proc_sys_dir_operations;
- proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
+ proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
return sysctl_init();
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 0b7c8dffc9ae..72c07a34cff0 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -292,7 +292,7 @@ struct proc_dir_entry proc_root = {
.nlink = 2,
.refcnt = REFCOUNT_INIT(1),
.proc_iops = &proc_root_inode_operations,
- .proc_fops = &proc_root_operations,
+ .proc_dir_ops = &proc_root_operations,
.parent = &proc_root,
.subdir = RB_ROOT,
.name = "/proc",
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index fd931d3e77be..0449edf460f5 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -223,16 +223,16 @@ static int stat_open(struct inode *inode, struct file *file)
return single_open_size(file, show_stat, NULL, size);
}
-static const struct file_operations proc_stat_operations = {
- .open = stat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops stat_proc_ops = {
+ .proc_open = stat_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
static int __init proc_stat_init(void)
{
- proc_create("stat", 0, NULL, &proc_stat_operations);
+ proc_create("stat", 0, NULL, &stat_proc_ops);
return 0;
}
fs_initcall(proc_stat_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9442631fd4af..3ba9ae83bff5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -505,7 +505,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
#ifdef CONFIG_SHMEM
static int smaps_pte_hole(unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
@@ -1282,7 +1282,7 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
}
static int pagemap_pte_hole(unsigned long start, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
unsigned long addr = start;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7b13988796e1..7dc800cce354 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -667,10 +667,10 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
}
#endif
-static const struct file_operations proc_vmcore_operations = {
- .read = read_vmcore,
- .llseek = default_llseek,
- .mmap = mmap_vmcore,
+static const struct proc_ops vmcore_proc_ops = {
+ .proc_read = read_vmcore,
+ .proc_lseek = default_llseek,
+ .proc_mmap = mmap_vmcore,
};
static struct vmcore* __init get_new_element(void)
@@ -1555,7 +1555,7 @@ static int __init vmcore_init(void)
elfcorehdr_free(elfcorehdr_addr);
elfcorehdr_addr = ELFCORE_ADDR_ERR;
- proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
+ proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &vmcore_proc_ops);
if (proc_vmcore)
proc_vmcore->size = vmcore_size;
return 0;
diff --git a/fs/read_write.c b/fs/read_write.c
index 7458fccc59e1..59d819c5b92e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -939,6 +939,34 @@ out:
return ret;
}
+ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ size_t tot_len;
+ ssize_t ret = 0;
+
+ if (!file->f_op->read_iter)
+ return -EINVAL;
+ if (!(file->f_mode & FMODE_READ))
+ return -EBADF;
+ if (!(file->f_mode & FMODE_CAN_READ))
+ return -EINVAL;
+
+ tot_len = iov_iter_count(iter);
+ if (!tot_len)
+ goto out;
+ ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
+ if (ret < 0)
+ return ret;
+
+ ret = call_read_iter(file, iocb, iter);
+out:
+ if (ret >= 0)
+ fsnotify_access(file);
+ return ret;
+}
+EXPORT_SYMBOL(vfs_iocb_iter_read);
+
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
rwf_t flags)
{
@@ -975,6 +1003,34 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
return ret;
}
+ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ size_t tot_len;
+ ssize_t ret = 0;
+
+ if (!file->f_op->write_iter)
+ return -EINVAL;
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EBADF;
+ if (!(file->f_mode & FMODE_CAN_WRITE))
+ return -EINVAL;
+
+ tot_len = iov_iter_count(iter);
+ if (!tot_len)
+ return 0;
+ ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
+ if (ret < 0)
+ return ret;
+
+ ret = call_write_iter(file, iocb, iter);
+ if (ret > 0)
+ fsnotify_modify(file);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_iocb_iter_write);
+
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
rwf_t flags)
{
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index d41c21fef138..c4ab045926b7 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -449,7 +449,7 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
}
link = kernfs_create_link(kobj->sd, target_name, entry);
- if (IS_ERR(link) && PTR_ERR(link) == -EEXIST)
+ if (PTR_ERR(link) == -EEXIST)
sysfs_warn_dup(kobj->sd, target_name);
kernfs_put(entry);
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 0caa151cae4e..0ee8c6dfb036 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -330,7 +330,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
parent = tracefs_mount->mnt_root;
inode_lock(parent->d_inode);
- dentry = lookup_one_len(name, parent, strlen(name));
+ if (unlikely(IS_DEADDIR(parent->d_inode)))
+ dentry = ERR_PTR(-ENOENT);
+ else
+ dentry = lookup_one_len(name, parent, strlen(name));
if (!IS_ERR(dentry) && dentry->d_inode) {
dput(dentry);
dentry = ERR_PTR(-EEXIST);
@@ -499,122 +502,27 @@ __init struct dentry *tracefs_create_instance_dir(const char *name,
return dentry;
}
-static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
+static void remove_one(struct dentry *victim)
{
- int ret = 0;
-
- if (simple_positive(dentry)) {
- if (dentry->d_inode) {
- dget(dentry);
- switch (dentry->d_inode->i_mode & S_IFMT) {
- case S_IFDIR:
- ret = simple_rmdir(parent->d_inode, dentry);
- if (!ret)
- fsnotify_rmdir(parent->d_inode, dentry);
- break;
- default:
- simple_unlink(parent->d_inode, dentry);
- fsnotify_unlink(parent->d_inode, dentry);
- break;
- }
- if (!ret)
- d_delete(dentry);
- dput(dentry);
- }
- }
- return ret;
-}
-
-/**
- * tracefs_remove - removes a file or directory from the tracefs filesystem
- * @dentry: a pointer to a the dentry of the file or directory to be
- * removed.
- *
- * This function removes a file or directory in tracefs that was previously
- * created with a call to another tracefs function (like
- * tracefs_create_file() or variants thereof.)
- */
-void tracefs_remove(struct dentry *dentry)
-{
- struct dentry *parent;
- int ret;
-
- if (IS_ERR_OR_NULL(dentry))
- return;
-
- parent = dentry->d_parent;
- inode_lock(parent->d_inode);
- ret = __tracefs_remove(dentry, parent);
- inode_unlock(parent->d_inode);
- if (!ret)
- simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
}
/**
- * tracefs_remove_recursive - recursively removes a directory
+ * tracefs_remove - recursively removes a directory
* @dentry: a pointer to a the dentry of the directory to be removed.
*
* This function recursively removes a directory tree in tracefs that
* was previously created with a call to another tracefs function
* (like tracefs_create_file() or variants thereof.)
*/
-void tracefs_remove_recursive(struct dentry *dentry)
+void tracefs_remove(struct dentry *dentry)
{
- struct dentry *child, *parent;
-
if (IS_ERR_OR_NULL(dentry))
return;
- parent = dentry;
- down:
- inode_lock(parent->d_inode);
- loop:
- /*
- * The parent->d_subdirs is protected by the d_lock. Outside that
- * lock, the child can be unlinked and set to be freed which can
- * use the d_u.d_child as the rcu head and corrupt this list.
- */
- spin_lock(&parent->d_lock);
- list_for_each_entry(child, &parent->d_subdirs, d_child) {
- if (!simple_positive(child))
- continue;
-
- /* perhaps simple_empty(child) makes more sense */
- if (!list_empty(&child->d_subdirs)) {
- spin_unlock(&parent->d_lock);
- inode_unlock(parent->d_inode);
- parent = child;
- goto down;
- }
-
- spin_unlock(&parent->d_lock);
-
- if (!__tracefs_remove(child, parent))
- simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-
- /*
- * The parent->d_lock protects agaist child from unlinking
- * from d_subdirs. When releasing the parent->d_lock we can
- * no longer trust that the next pointer is valid.
- * Restart the loop. We'll skip this one with the
- * simple_positive() check.
- */
- goto loop;
- }
- spin_unlock(&parent->d_lock);
-
- inode_unlock(parent->d_inode);
- child = parent;
- parent = parent->d_parent;
- inode_lock(parent->d_inode);
-
- if (child != dentry)
- /* go up */
- goto loop;
-
- if (!__tracefs_remove(child, parent))
- simple_release_fs(&tracefs_mount, &tracefs_mount_count);
- inode_unlock(parent->d_inode);
+ simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count);
+ simple_recursive_removal(dentry, remove_one);
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
}
/**
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index bc4dec5b1633..743928efffc1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1080,18 +1080,12 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
inode->i_uid = attr->ia_uid;
if (attr->ia_valid & ATTR_GID)
inode->i_gid = attr->ia_gid;
- if (attr->ia_valid & ATTR_ATIME) {
- inode->i_atime = timestamp_truncate(attr->ia_atime,
- inode);
- }
- if (attr->ia_valid & ATTR_MTIME) {
- inode->i_mtime = timestamp_truncate(attr->ia_mtime,
- inode);
- }
- if (attr->ia_valid & ATTR_CTIME) {
- inode->i_ctime = timestamp_truncate(attr->ia_ctime,
- inode);
- }
+ if (attr->ia_valid & ATTR_ATIME)
+ inode->i_atime = attr->ia_atime;
+ if (attr->ia_valid & ATTR_MTIME)
+ inode->i_mtime = attr->ia_mtime;
+ if (attr->ia_valid & ATTR_CTIME)
+ inode->i_ctime = attr->ia_ctime;
if (attr->ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 17c90dff7266..4b4b65b48c57 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -84,7 +84,6 @@ static int create_default_filesystem(struct ubifs_info *c)
int idx_node_size;
long long tmp64, main_bytes;
__le64 tmp_le64;
- __le32 tmp_le32;
struct timespec64 ts;
u8 hash[UBIFS_HASH_ARR_SZ];
u8 hash_lpt[UBIFS_HASH_ARR_SZ];
@@ -291,16 +290,14 @@ static int create_default_filesystem(struct ubifs_info *c)
ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
ino->nlink = cpu_to_le32(2);
- ktime_get_real_ts64(&ts);
- ts = timespec64_trunc(ts, DEFAULT_TIME_GRAN);
+ ktime_get_coarse_real_ts64(&ts);
tmp_le64 = cpu_to_le64(ts.tv_sec);
ino->atime_sec = tmp_le64;
ino->ctime_sec = tmp_le64;
ino->mtime_sec = tmp_le64;
- tmp_le32 = cpu_to_le32(ts.tv_nsec);
- ino->atime_nsec = tmp_le32;
- ino->ctime_nsec = tmp_le32;
- ino->mtime_nsec = tmp_le32;
+ ino->atime_nsec = 0;
+ ino->ctime_nsec = 0;
+ ino->mtime_nsec = 0;
ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);
diff --git a/fs/utimes.c b/fs/utimes.c
index c952b6b3d8a0..1d17ce98cb80 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -36,14 +36,14 @@ static int utimes_common(const struct path *path, struct timespec64 *times)
if (times[0].tv_nsec == UTIME_OMIT)
newattrs.ia_valid &= ~ATTR_ATIME;
else if (times[0].tv_nsec != UTIME_NOW) {
- newattrs.ia_atime = timestamp_truncate(times[0], inode);
+ newattrs.ia_atime = times[0];
newattrs.ia_valid |= ATTR_ATIME_SET;
}
if (times[1].tv_nsec == UTIME_OMIT)
newattrs.ia_valid &= ~ATTR_MTIME;
else if (times[1].tv_nsec != UTIME_NOW) {
- newattrs.ia_mtime = timestamp_truncate(times[1], inode);
+ newattrs.ia_mtime = times[1];
newattrs.ia_valid |= ATTR_MTIME_SET;
}
/*
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 14fbdf22b7e7..08d6beb54f8c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -23,25 +23,28 @@
#include "xfs_ag_resv.h"
#include "xfs_health.h"
-static struct xfs_buf *
+static int
xfs_get_aghdr_buf(
struct xfs_mount *mp,
xfs_daddr_t blkno,
size_t numblks,
+ struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
+ int error;
- bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0);
- if (!bp)
- return NULL;
+ error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp);
+ if (error)
+ return error;
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
bp->b_bn = blkno;
bp->b_maps[0].bm_bn = blkno;
bp->b_ops = ops;
- return bp;
+ *bpp = bp;
+ return 0;
}
static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
@@ -340,13 +343,13 @@ xfs_ag_init_hdr(
struct aghdr_init_data *id,
aghdr_init_work_f work,
const struct xfs_buf_ops *ops)
-
{
struct xfs_buf *bp;
+ int error;
- bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops);
- if (!bp)
- return -ENOMEM;
+ error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops);
+ if (error)
+ return error;
(*work)(mp, bp, id);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index fc93fd88ec89..d8053bc96c4d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1070,11 +1070,11 @@ xfs_alloc_ag_vextent_small(
if (args->datatype & XFS_ALLOC_USERDATA) {
struct xfs_buf *bp;
- bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno);
- if (XFS_IS_CORRUPT(args->mp, !bp)) {
- error = -EFSCORRUPTED;
+ error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
+ args->mp->m_bsize, 0, &bp);
+ if (error)
goto error;
- }
xfs_trans_binval(args->tp, bp);
}
*fbnop = args->agbno = fbno;
@@ -2347,9 +2347,11 @@ xfs_free_agfl_block(
if (error)
return error;
- bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno);
- if (XFS_IS_CORRUPT(tp->t_mountp, !bp))
- return -EFSCORRUPTED;
+ error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
+ tp->t_mountp->m_bsize, 0, &bp);
+ if (error)
+ return error;
xfs_trans_binval(tp, bp);
return 0;
@@ -2500,12 +2502,11 @@ xfs_alloc_fix_freelist(
if (!pag->pagf_init) {
error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
- if (error)
+ if (error) {
+ /* Couldn't lock the AGF so skip this AG. */
+ if (error == -EAGAIN)
+ error = 0;
goto out_no_agbp;
- if (!pag->pagf_init) {
- ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
- ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- goto out_agbp_relse;
}
}
@@ -2531,11 +2532,10 @@ xfs_alloc_fix_freelist(
*/
if (!agbp) {
error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
- if (error)
- goto out_no_agbp;
- if (!agbp) {
- ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
- ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+ if (error) {
+ /* Couldn't lock the AGF so skip this AG. */
+ if (error == -EAGAIN)
+ error = 0;
goto out_no_agbp;
}
}
@@ -2766,11 +2766,10 @@ xfs_alloc_pagf_init(
xfs_buf_t *bp;
int error;
- if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
- return error;
- if (bp)
+ error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
+ if (!error)
xfs_trans_brelse(tp, bp);
- return 0;
+ return error;
}
/*
@@ -2956,14 +2955,11 @@ xfs_read_agf(
trace_xfs_read_agf(mp, agno);
ASSERT(agno != NULLAGNUMBER);
- error = xfs_trans_read_buf(
- mp, tp, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
if (error)
return error;
- if (!*bpp)
- return 0;
ASSERT(!(*bpp)->b_error);
xfs_buf_set_ref(*bpp, XFS_AGF_REF);
@@ -2987,14 +2983,15 @@ xfs_alloc_read_agf(
trace_xfs_alloc_read_agf(mp, agno);
+ /* We don't support trylock when freeing. */
+ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
+ (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
ASSERT(agno != NULLAGNUMBER);
error = xfs_read_agf(mp, tp, agno,
(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
bpp);
if (error)
return error;
- if (!*bpp)
- return 0;
ASSERT(!(*bpp)->b_error);
agf = XFS_BUF_TO_AGF(*bpp);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index a266d05df146..8b7f74b3bea2 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -418,20 +418,10 @@ xfs_attr_rmtval_get(
(map[i].br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0,
- &xfs_attr3_rmt_buf_ops);
- if (!bp)
- return -ENOMEM;
- error = bp->b_error;
- if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_relse(bp);
-
- /* bad CRC means corrupted metadata */
- if (error == -EFSBADCRC)
- error = -EFSCORRUPTED;
+ error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt,
+ 0, &bp, &xfs_attr3_rmt_buf_ops);
+ if (error)
return error;
- }
error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
&offset, &valuelen,
@@ -555,9 +545,9 @@ xfs_attr_rmtval_set(
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
- bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt);
- if (!bp)
- return -ENOMEM;
+ error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, &bp);
+ if (error)
+ return error;
bp->b_ops = &xfs_attr3_rmt_buf_ops;
xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 4c2e046fbfad..9a6d7a84689a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -730,11 +730,11 @@ xfs_bmap_extents_to_btree(
cur->bc_private.b.allocated++;
ip->i_d.di_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
- abp = xfs_btree_get_bufl(mp, tp, args.fsbno);
- if (XFS_IS_CORRUPT(mp, !abp)) {
- error = -EFSCORRUPTED;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, args.fsbno),
+ mp->m_bsize, 0, &abp);
+ if (error)
goto out_unreserve_dquot;
- }
/*
* Fill in the child block.
@@ -878,7 +878,11 @@ xfs_bmap_local_to_extents(
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
tp->t_firstblock = args.fsbno;
- bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno);
+ error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(args.mp, args.fsbno),
+ args.mp->m_bsize, 0, &bp);
+ if (error)
+ goto done;
/*
* Initialize the block, copy the data and log the remote buffer.
@@ -3307,11 +3311,12 @@ xfs_bmap_longest_free_extent(
pag = xfs_perag_get(mp, ag);
if (!pag->pagf_init) {
error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
- if (error)
- goto out;
-
- if (!pag->pagf_init) {
- *notinit = 1;
+ if (error) {
+ /* Couldn't lock the AGF, so skip this AG. */
+ if (error == -EAGAIN) {
+ *notinit = 1;
+ error = 0;
+ }
goto out;
}
}
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index b22c7e928eb1..fd300dc93ca4 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -679,42 +679,6 @@ xfs_btree_get_block(
}
/*
- * Get a buffer for the block, return it with no data read.
- * Long-form addressing.
- */
-xfs_buf_t * /* buffer for fsbno */
-xfs_btree_get_bufl(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t fsbno) /* file system block number */
-{
- xfs_daddr_t d; /* real disk block address */
-
- ASSERT(fsbno != NULLFSBLOCK);
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
-}
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Short-form addressing.
- */
-xfs_buf_t * /* buffer for agno/agbno */
-xfs_btree_get_bufs(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno) /* allocation group block number */
-{
- xfs_daddr_t d; /* real disk block address */
-
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agbno != NULLAGBLOCK);
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
-}
-
-/*
* Change the cursor to point to the first record at the given level.
* Other levels are unaffected.
*/
@@ -1270,11 +1234,10 @@ xfs_btree_get_buf_block(
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
- *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0);
-
- if (!*bpp)
- return -ENOMEM;
+ error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
+ 0, bpp);
+ if (error)
+ return error;
(*bpp)->b_ops = cur->bc_ops->buf_ops;
*block = XFS_BUF_TO_BLOCK(*bpp);
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index fb9b2121c628..3eff7c321d43 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -297,27 +297,6 @@ xfs_btree_dup_cursor(
xfs_btree_cur_t **ncur);/* output cursor */
/*
- * Get a buffer for the block, return it with no data read.
- * Long-form addressing.
- */
-struct xfs_buf * /* buffer for fsbno */
-xfs_btree_get_bufl(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno); /* file system block number */
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Short-form addressing.
- */
-struct xfs_buf * /* buffer for agno/agbno */
-xfs_btree_get_bufs(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno); /* allocation group block number */
-
-/*
* Compute first and last byte offsets for the fields given.
* Interprets the offsets table, which contains struct field offsets.
*/
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 8c3eafe280ed..875e04f82541 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2591,13 +2591,9 @@ xfs_da_get_buf(
if (error || nmap == 0)
goto out_free;
- bp = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0);
- error = bp ? bp->b_error : -EIO;
- if (error) {
- if (bp)
- xfs_trans_brelse(tp, bp);
+ error = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0, &bp);
+ if (error)
goto out_free;
- }
*bpp = bp;
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 5b759af4d165..bf161e930f1d 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -276,6 +276,7 @@ xfs_ialloc_inode_init(
int i, j;
xfs_daddr_t d;
xfs_ino_t ino = 0;
+ int error;
/*
* Loop over the new block(s), filling in the inodes. For small block
@@ -327,12 +328,11 @@ xfs_ialloc_inode_init(
*/
d = XFS_AGB_TO_DADDR(mp, agno, agbno +
(j * M_IGEO(mp)->blocks_per_cluster));
- fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize *
- M_IGEO(mp)->blocks_per_cluster,
- XBF_UNMAPPED);
- if (!fbuf)
- return -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
+ XBF_UNMAPPED, &fbuf);
+ if (error)
+ return error;
/* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index d7d702ee4d1a..6e1665f2cb67 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1177,8 +1177,6 @@ xfs_refcount_finish_one(
XFS_ALLOC_FLAG_FREEING, &agbp);
if (error)
return error;
- if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
- return -EFSCORRUPTED;
rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
if (!rcur) {
@@ -1718,10 +1716,6 @@ xfs_refcount_recover_cow_leftovers(
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (error)
goto out_trans;
- if (!agbp) {
- error = -ENOMEM;
- goto out_trans;
- }
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
/* Find all the leftover CoW staging extents. */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 0ac69751fe85..2f60fc3c99a0 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -985,9 +985,9 @@ xfs_update_secondary_sbs(
for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
struct xfs_buf *bp;
- bp = xfs_buf_get(mp->m_ddev_targp,
+ error = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
- XFS_FSS_TO_BB(mp, 1));
+ XFS_FSS_TO_BB(mp, 1), &bp);
/*
* If we get an error reading or writing alternate superblocks,
* continue. xfs_repair chooses the "best" superblock based
@@ -995,12 +995,12 @@ xfs_update_secondary_sbs(
* superblocks un-updated than updated, and xfs_repair may
* pick them over the properly-updated primary.
*/
- if (!bp) {
+ if (error) {
xfs_warn(mp,
"error allocating secondary superblock for ag %d",
agno);
if (!saved_error)
- saved_error = -ENOMEM;
+ saved_error = error;
continue;
}
@@ -1185,13 +1185,14 @@ xfs_sb_get_secondary(
struct xfs_buf **bpp)
{
struct xfs_buf *bp;
+ int error;
ASSERT(agno != 0 && agno != NULLAGNUMBER);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
- if (!bp)
- return -ENOMEM;
+ XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ if (error)
+ return error;
bp->b_ops = &xfs_sb_buf_ops;
xfs_buf_oneshot(bp);
*bpp = bp;
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 7a1a38b636a9..d5e6db9af434 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -659,8 +659,6 @@ xrep_agfl(
error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
if (error)
return error;
- if (!agf_bp)
- return -ENOMEM;
/*
* Make sure we have the AGFL buffer, as scrub might have decided it
@@ -735,8 +733,6 @@ xrep_agi_find_btrees(
error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
if (error)
return error;
- if (!agf_bp)
- return -ENOMEM;
/* Find the btree roots. */
error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL);
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 7251c66a82c9..ec2064ed3c30 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -83,9 +83,6 @@ xchk_fscount_warmup(
error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
if (error)
break;
- error = -ENOMEM;
- if (!agf_bp || !agi_bp)
- break;
/*
* These are supposed to be initialized by the header read
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index b70a88bc975e..e489d7a8446a 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -341,13 +341,17 @@ xrep_init_btblock(
struct xfs_trans *tp = sc->tp;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *bp;
+ int error;
trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
XFS_FSB_TO_AGBNO(mp, fsb), btnum);
ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
- XFS_FSB_TO_BB(mp, 1), 0);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
+ &bp);
+ if (error)
+ return error;
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
@@ -542,8 +546,6 @@ xrep_reap_block(
error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
if (error)
return error;
- if (!agf_bp)
- return -ENOMEM;
} else {
agf_bp = sc->sa.agf_bp;
}
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 8fbb841cd6fe..bbfa6ba84dcd 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -205,11 +205,12 @@ xfs_attr3_node_inactive(
/*
* Remove the subsidiary block from the cache and from the log.
*/
- child_bp = xfs_trans_get_buf(*trans, mp->m_ddev_targp,
+ error = xfs_trans_get_buf(*trans, mp->m_ddev_targp,
child_blkno,
- XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0);
- if (!child_bp)
- return -EIO;
+ XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0,
+ &child_bp);
+ if (error)
+ return error;
error = bp->b_error;
if (error) {
xfs_trans_brelse(*trans, child_bp);
@@ -298,10 +299,10 @@ xfs_attr3_root_inactive(
/*
* Invalidate the incore copy of the root block.
*/
- bp = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno,
- XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0);
- if (!bp)
- return -EIO;
+ error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno,
+ XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp);
+ if (error)
+ return error;
error = bp->b_error;
if (error) {
xfs_trans_brelse(*trans, bp);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index a0229c368e78..217e4f82a44a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -198,20 +198,22 @@ xfs_buf_free_maps(
}
}
-static struct xfs_buf *
+static int
_xfs_buf_alloc(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
- xfs_buf_flags_t flags)
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
struct xfs_buf *bp;
int error;
int i;
+ *bpp = NULL;
bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
if (unlikely(!bp))
- return NULL;
+ return -ENOMEM;
/*
* We don't want certain flags to appear in b_flags unless they are
@@ -239,7 +241,7 @@ _xfs_buf_alloc(
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
kmem_cache_free(xfs_buf_zone, bp);
- return NULL;
+ return error;
}
bp->b_bn = map[0].bm_bn;
@@ -256,7 +258,8 @@ _xfs_buf_alloc(
XFS_STATS_INC(bp->b_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
- return bp;
+ *bpp = bp;
+ return 0;
}
/*
@@ -682,53 +685,39 @@ xfs_buf_incore(
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
* more hits than misses.
*/
-struct xfs_buf *
+int
xfs_buf_get_map(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
- xfs_buf_flags_t flags)
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
struct xfs_buf *bp;
struct xfs_buf *new_bp;
int error = 0;
+ *bpp = NULL;
error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
-
- switch (error) {
- case 0:
- /* cache hit */
+ if (!error)
goto found;
- case -EAGAIN:
- /* cache hit, trylock failure, caller handles failure */
- ASSERT(flags & XBF_TRYLOCK);
- return NULL;
- case -ENOENT:
- /* cache miss, go for insert */
- break;
- case -EFSCORRUPTED:
- default:
- /*
- * None of the higher layers understand failure types
- * yet, so return NULL to signal a fatal lookup error.
- */
- return NULL;
- }
+ if (error != -ENOENT)
+ return error;
- new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
- if (unlikely(!new_bp))
- return NULL;
+ error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
+ if (error)
+ return error;
error = xfs_buf_allocate_memory(new_bp, flags);
if (error) {
xfs_buf_free(new_bp);
- return NULL;
+ return error;
}
error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
if (error) {
xfs_buf_free(new_bp);
- return NULL;
+ return error;
}
if (bp != new_bp)
@@ -741,7 +730,7 @@ found:
xfs_warn(target->bt_mount,
"%s: failed to map pagesn", __func__);
xfs_buf_relse(bp);
- return NULL;
+ return error;
}
}
@@ -754,7 +743,8 @@ found:
XFS_STATS_INC(target->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
- return bp;
+ *bpp = bp;
+ return 0;
}
STATIC int
@@ -806,46 +796,77 @@ xfs_buf_reverify(
return bp->b_error;
}
-xfs_buf_t *
+int
xfs_buf_read_map(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
- const struct xfs_buf_ops *ops)
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops,
+ xfs_failaddr_t fa)
{
struct xfs_buf *bp;
+ int error;
flags |= XBF_READ;
+ *bpp = NULL;
- bp = xfs_buf_get_map(target, map, nmaps, flags);
- if (!bp)
- return NULL;
+ error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
+ if (error)
+ return error;
trace_xfs_buf_read(bp, flags, _RET_IP_);
if (!(bp->b_flags & XBF_DONE)) {
+ /* Initiate the buffer read and wait. */
XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
- _xfs_buf_read(bp, flags);
- return bp;
+ error = _xfs_buf_read(bp, flags);
+
+ /* Readahead iodone already dropped the buffer, so exit. */
+ if (flags & XBF_ASYNC)
+ return 0;
+ } else {
+ /* Buffer already read; all we need to do is check it. */
+ error = xfs_buf_reverify(bp, ops);
+
+ /* Readahead already finished; drop the buffer and exit. */
+ if (flags & XBF_ASYNC) {
+ xfs_buf_relse(bp);
+ return 0;
+ }
+
+ /* We do not want read in the flags */
+ bp->b_flags &= ~XBF_READ;
+ ASSERT(bp->b_ops != NULL || ops == NULL);
}
- xfs_buf_reverify(bp, ops);
+ /*
+ * If we've had a read error, then the contents of the buffer are
+ * invalid and should not be used. To ensure that a followup read tries
+ * to pull the buffer from disk again, we clear the XBF_DONE flag and
+ * mark the buffer stale. This ensures that anyone who has a current
+ * reference to the buffer will interpret it's contents correctly and
+ * future cache lookups will also treat it as an empty, uninitialised
+ * buffer.
+ */
+ if (error) {
+ if (!XFS_FORCED_SHUTDOWN(target->bt_mount))
+ xfs_buf_ioerror_alert(bp, fa);
- if (flags & XBF_ASYNC) {
- /*
- * Read ahead call which is already satisfied,
- * drop the buffer
- */
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_stale(bp);
xfs_buf_relse(bp);
- return NULL;
+
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
}
- /* We do not want read in the flags */
- bp->b_flags &= ~XBF_READ;
- ASSERT(bp->b_ops != NULL || ops == NULL);
- return bp;
+ *bpp = bp;
+ return 0;
}
/*
@@ -859,11 +880,14 @@ xfs_buf_readahead_map(
int nmaps,
const struct xfs_buf_ops *ops)
{
+ struct xfs_buf *bp;
+
if (bdi_read_congested(target->bt_bdev->bd_bdi))
return;
xfs_buf_read_map(target, map, nmaps,
- XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
+ XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
+ __this_address);
}
/*
@@ -880,12 +904,13 @@ xfs_buf_read_uncached(
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
+ int error;
*bpp = NULL;
- bp = xfs_buf_get_uncached(target, numblks, flags);
- if (!bp)
- return -ENOMEM;
+ error = xfs_buf_get_uncached(target, numblks, flags, &bp);
+ if (error)
+ return error;
/* set up the buffer for a read IO */
ASSERT(bp->b_map_count == 1);
@@ -896,7 +921,7 @@ xfs_buf_read_uncached(
xfs_buf_submit(bp);
if (bp->b_error) {
- int error = bp->b_error;
+ error = bp->b_error;
xfs_buf_relse(bp);
return error;
}
@@ -905,20 +930,23 @@ xfs_buf_read_uncached(
return 0;
}
-xfs_buf_t *
+int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
- int flags)
+ int flags,
+ struct xfs_buf **bpp)
{
unsigned long page_count;
int error, i;
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
+ *bpp = NULL;
+
/* flags might contain irrelevant bits, pass only what we care about */
- bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
- if (unlikely(bp == NULL))
+ error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
+ if (error)
goto fail;
page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
@@ -928,8 +956,10 @@ xfs_buf_get_uncached(
for (i = 0; i < page_count; i++) {
bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
- if (!bp->b_pages[i])
+ if (!bp->b_pages[i]) {
+ error = -ENOMEM;
goto fail_free_mem;
+ }
}
bp->b_flags |= _XBF_PAGES;
@@ -941,7 +971,8 @@ xfs_buf_get_uncached(
}
trace_xfs_buf_get_uncached(bp, _RET_IP_);
- return bp;
+ *bpp = bp;
+ return 0;
fail_free_mem:
while (--i >= 0)
@@ -951,7 +982,7 @@ xfs_buf_get_uncached(
xfs_buf_free_maps(bp);
kmem_cache_free(xfs_buf_zone, bp);
fail:
- return NULL;
+ return error;
}
/*
@@ -1205,10 +1236,10 @@ __xfs_buf_ioerror(
void
xfs_buf_ioerror_alert(
struct xfs_buf *bp,
- const char *func)
+ xfs_failaddr_t func)
{
xfs_alert(bp->b_mount,
-"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
+"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
-bp->b_error);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 56e081dd1d96..d79a1fe5d738 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -192,37 +192,40 @@ struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
xfs_daddr_t blkno, size_t numblks,
xfs_buf_flags_t flags);
-struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- xfs_buf_flags_t flags);
-struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- xfs_buf_flags_t flags,
- const struct xfs_buf_ops *ops);
+int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
+ int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp);
+int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
+ int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops, xfs_failaddr_t fa);
void xfs_buf_readahead_map(struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
const struct xfs_buf_ops *ops);
-static inline struct xfs_buf *
+static inline int
xfs_buf_get(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
- size_t numblks)
+ size_t numblks,
+ struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_buf_get_map(target, &map, 1, 0);
+
+ return xfs_buf_get_map(target, &map, 1, 0, bpp);
}
-static inline struct xfs_buf *
+static inline int
xfs_buf_read(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
size_t numblks,
xfs_buf_flags_t flags,
+ struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_buf_read_map(target, &map, 1, flags, ops);
+
+ return xfs_buf_read_map(target, &map, 1, flags, bpp, ops,
+ __builtin_return_address(0));
}
static inline void
@@ -236,8 +239,8 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
- int flags);
+int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags,
+ struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
size_t numblks, int flags, struct xfs_buf **bpp,
const struct xfs_buf_ops *ops);
@@ -259,7 +262,7 @@ extern void xfs_buf_ioend(struct xfs_buf *bp);
extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
-extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
+extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
extern int __xfs_buf_submit(struct xfs_buf *bp, bool);
static inline int xfs_buf_submit(struct xfs_buf *bp)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 5be8973a452c..663810e6cd59 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1113,7 +1113,7 @@ xfs_buf_iodone_callback_error(
if (bp->b_target != lasttarg ||
time_after(jiffies, (lasttime + 5*HZ))) {
lasttime = jiffies;
- xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_ioerror_alert(bp, __this_address);
}
lasttarg = bp->b_target;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index cae613620175..0b8350e84d28 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -45,7 +45,7 @@ xfs_trim_extents(
xfs_log_force(mp, XFS_LOG_SYNC);
error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
- if (error || !agbp)
+ if (error)
goto out_put_perag;
cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 9cfd3209f52b..d223e1ae90a6 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -320,10 +320,10 @@ xfs_dquot_disk_alloc(
dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
/* now we can just get the buffer (there's nothing to read yet) */
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0);
- if (!bp)
- return -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ return error;
bp->b_ops = &xfs_dquot_buf_ops;
/*
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 5f12b5d8527a..1a88025e68a3 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -159,16 +159,15 @@ xfs_filestream_pick_ag(
if (!pag->pagf_init) {
err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
- if (err && !trylock) {
+ if (err) {
xfs_perag_put(pag);
- return err;
+ if (err != -EAGAIN)
+ return err;
+ /* Couldn't lock the AGF, skip this AG. */
+ continue;
}
}
- /* Might fail sometimes during the 1st pass with trylock set. */
- if (!pag->pagf_init)
- goto next_ag;
-
/* Keep track of the AG with the most free blocks. */
if (pag->pagf_freeblks > maxfree) {
maxfree = pag->pagf_freeblks;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1979a0055763..c5077e6326c7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2546,6 +2546,7 @@ xfs_ifree_cluster(
struct xfs_perag *pag;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
xfs_ino_t inum;
+ int error;
inum = xic->first_ino;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
@@ -2574,12 +2575,11 @@ xfs_ifree_cluster(
* complete before we get a lock on it, and hence we may fail
* to mark all the active inodes on the buffer stale.
*/
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * igeo->blocks_per_cluster,
- XBF_UNMAPPED);
-
- if (!bp)
- return -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+ mp->m_bsize * igeo->blocks_per_cluster,
+ XBF_UNMAPPED, &bp);
+ if (error)
+ return error;
/*
* This buffer may not have been correctly initialised as we
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0d683fb96396..25cfc85dbaa7 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -294,7 +294,7 @@ xlog_recover_iodone(
* this during recovery. One strike!
*/
if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
- xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_ioerror_alert(bp, __this_address);
xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
}
}
@@ -2745,15 +2745,10 @@ xlog_recover_buffer_pass2(
if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
buf_flags |= XBF_UNMAPPED;
- bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
- buf_flags, NULL);
- if (!bp)
- return -ENOMEM;
- error = bp->b_error;
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
- goto out_release;
- }
+ error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+ buf_flags, &bp, NULL);
+ if (error)
+ return error;
/*
* Recover the buffer only if we get an LSN from it and it's less than
@@ -2950,17 +2945,10 @@ xlog_recover_inode_pass2(
}
trace_xfs_log_recover_inode_recover(log, in_f);
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
- &xfs_inode_buf_ops);
- if (!bp) {
- error = -ENOMEM;
+ error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
+ 0, &bp, &xfs_inode_buf_ops);
+ if (error)
goto error;
- }
- error = bp->b_error;
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
- goto out_release;
- }
ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
dip = xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -5639,7 +5627,7 @@ xlog_do_recover(
error = xfs_buf_submit(bp);
if (error) {
if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_ioerror_alert(bp, __this_address);
ASSERT(0);
}
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e723b267a247..b0ce04ffd3cd 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -143,8 +143,6 @@ xfs_reflink_find_shared(
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (error)
return error;
- if (!agbp)
- return -ENOMEM;
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index d42b5a2047e0..6209e7b6b895 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -826,12 +826,10 @@ xfs_growfs_rt_alloc(
* Get a buffer for the block.
*/
d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0);
- if (bp == NULL) {
- error = -EIO;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize, 0, &bp);
+ if (error)
goto out_trans_cancel;
- }
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
/*
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index a25502bc2071..d762d42ed0ff 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -53,20 +53,10 @@ xfs_readlink_bmap_ilocked(
d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
- &xfs_symlink_buf_ops);
- if (!bp)
- return -ENOMEM;
- error = bp->b_error;
- if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_relse(bp);
-
- /* bad CRC means corrupted metadata */
- if (error == -EFSBADCRC)
- error = -EFSCORRUPTED;
- goto out;
- }
+ error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+ &bp, &xfs_symlink_buf_ops);
+ if (error)
+ return error;
byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
if (pathlen < byte_cnt)
byte_cnt = pathlen;
@@ -290,12 +280,10 @@ xfs_symlink(
d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- BTOBB(byte_cnt), 0);
- if (!bp) {
- error = -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ BTOBB(byte_cnt), 0, &bp);
+ if (error)
goto out_trans_cancel;
- }
bp->b_ops = &xfs_symlink_buf_ops;
byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -433,13 +421,12 @@ xfs_inactive_symlink_rmt(
* Invalidate the block(s). No validation is done.
*/
for (i = 0; i < nmaps; i++) {
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
- XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
- if (!bp) {
- error = -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+ XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0,
+ &bp);
+ if (error)
goto error_trans_cancel;
- }
xfs_trans_binval(tp, bp);
}
/*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 64d7f171ebd3..752c7fef9de7 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -169,21 +169,21 @@ int xfs_trans_alloc_empty(struct xfs_mount *mp,
struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
-struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,
- struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- uint flags);
+int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags,
+ struct xfs_buf **bpp);
-static inline struct xfs_buf *
+static inline int
xfs_trans_get_buf(
struct xfs_trans *tp,
struct xfs_buftarg *target,
xfs_daddr_t blkno,
int numblks,
- uint flags)
+ uint flags,
+ struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
+ return xfs_trans_get_buf_map(tp, target, &map, 1, flags, bpp);
}
int xfs_trans_read_buf_map(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index b5b3a78ef31c..08174ffa2118 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -112,19 +112,22 @@ xfs_trans_bjoin(
* If the transaction pointer is NULL, make this just a normal
* get_buf() call.
*/
-struct xfs_buf *
+int
xfs_trans_get_buf_map(
struct xfs_trans *tp,
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
- xfs_buf_flags_t flags)
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
xfs_buf_t *bp;
struct xfs_buf_log_item *bip;
+ int error;
+ *bpp = NULL;
if (!tp)
- return xfs_buf_get_map(target, map, nmaps, flags);
+ return xfs_buf_get_map(target, map, nmaps, flags, bpp);
/*
* If we find the buffer in the cache with this transaction
@@ -146,19 +149,20 @@ xfs_trans_get_buf_map(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_recur++;
trace_xfs_trans_get_buf_recur(bip);
- return bp;
+ *bpp = bp;
+ return 0;
}
- bp = xfs_buf_get_map(target, map, nmaps, flags);
- if (bp == NULL) {
- return NULL;
- }
+ error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
+ if (error)
+ return error;
ASSERT(!bp->b_error);
_xfs_trans_bjoin(tp, bp, 1);
trace_xfs_trans_get_buf(bp->b_log_item);
- return bp;
+ *bpp = bp;
+ return 0;
}
/*
@@ -276,7 +280,7 @@ xfs_trans_read_buf_map(
ASSERT(bp->b_ops != NULL);
error = xfs_buf_reverify(bp, ops);
if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_ioerror_alert(bp, __return_address);
if (tp->t_flags & XFS_TRANS_DIRTY)
xfs_force_shutdown(tp->t_mountp,
@@ -298,36 +302,17 @@ xfs_trans_read_buf_map(
return 0;
}
- bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
- if (!bp) {
- if (!(flags & XBF_TRYLOCK))
- return -ENOMEM;
- return tp ? 0 : -EAGAIN;
- }
-
- /*
- * If we've had a read error, then the contents of the buffer are
- * invalid and should not be used. To ensure that a followup read tries
- * to pull the buffer from disk again, we clear the XBF_DONE flag and
- * mark the buffer stale. This ensures that anyone who has a current
- * reference to the buffer will interpret it's contents correctly and
- * future cache lookups will also treat it as an empty, uninitialised
- * buffer.
- */
- if (bp->b_error) {
- error = bp->b_error;
- if (!XFS_FORCED_SHUTDOWN(mp))
- xfs_buf_ioerror_alert(bp, __func__);
- bp->b_flags &= ~XBF_DONE;
- xfs_buf_stale(bp);
-
+ error = xfs_buf_read_map(target, map, nmaps, flags, &bp, ops,
+ __return_address);
+ switch (error) {
+ case 0:
+ break;
+ default:
if (tp && (tp->t_flags & XFS_TRANS_DIRTY))
xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
- xfs_buf_relse(bp);
-
- /* bad CRC means corrupted metadata */
- if (error == -EFSBADCRC)
- error = -EFSCORRUPTED;
+ /* fall through */
+ case -ENOMEM:
+ case -EAGAIN:
return error;
}
OpenPOWER on IntegriCloud