diff options
Diffstat (limited to 'fs')
102 files changed, 1963 insertions, 1374 deletions
@@ -1610,6 +1610,14 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, return 0; } +static void aio_poll_put_work(struct work_struct *work) +{ + struct poll_iocb *req = container_of(work, struct poll_iocb, work); + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); + + iocb_put(iocb); +} + static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1674,6 +1682,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del_init(&req->wait.entry); if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + struct kioctx *ctx = iocb->ki_ctx; + /* * Try to complete the iocb inline if we can. Use * irqsave/irqrestore because not all filesystems (e.g. fuse) @@ -1683,8 +1693,14 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); req->done = true; - spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); - iocb_put(iocb); + if (iocb->ki_eventfd && eventfd_signal_count()) { + iocb = NULL; + INIT_WORK(&req->work, aio_poll_put_work); + schedule_work(&req->work); + } + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + if (iocb) + iocb_put(iocb); } else { schedule_work(&req->work); } diff --git a/fs/attr.c b/fs/attr.c index df28035aa23e..b4bbdbd4c8ca 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -183,18 +183,12 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if (ia_valid & ATTR_ATIME) { - inode->i_atime = timestamp_truncate(attr->ia_atime, - inode); - } - if (ia_valid & ATTR_MTIME) { - inode->i_mtime = timestamp_truncate(attr->ia_mtime, - inode); - } - if (ia_valid & ATTR_CTIME) { - inode->i_ctime = timestamp_truncate(attr->ia_ctime, - inode); - } + if (ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -268,8 +262,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de attr->ia_ctime = now; if (!(ia_valid & ATTR_ATIME_SET)) attr->ia_atime = now; + else + attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); if (!(ia_valid & ATTR_MTIME_SET)) attr->ia_mtime = now; + else + attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); + if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); if (error < 0) diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index c1da294418d1..0a0823d378db 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o quota.o io.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ - debugfs.o + debugfs.o util.o ceph-$(CONFIG_CEPH_FSCACHE) += cache.o ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index aa55f412a6e3..26be6520d3fb 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -222,8 +222,8 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); if (err) goto out_err; - err = ceph_pagelist_encode_string(pagelist, - XATTR_NAME_POSIX_ACL_DEFAULT, len); + ceph_pagelist_encode_string(pagelist, + XATTR_NAME_POSIX_ACL_DEFAULT, len); err = posix_acl_to_xattr(&init_user_ns, default_acl, tmp_buf, val_size2); if (err < 0) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9d09bb53c1ab..28ae0c134700 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -908,7 +908,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) ci_node); if (!__cap_is_valid(cap)) continue; - __touch_cap(cap); + if (cap->issued & mask) + __touch_cap(cap); } } return 1; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index c281f32b54f7..fb7cabd98e7b 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -33,7 +33,7 @@ static int mdsmap_show(struct seq_file *s, void *p) seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); - for (i = 0; i < mdsmap->m_num_mds; i++) { + for (i = 0; i < mdsmap->possible_max_rank; i++) { struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 2e4764fd1872..d0cd0aba5843 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1186,7 +1186,7 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) struct dentry *dn = di->dentry; struct ceph_mds_client *mdsc; - dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n", + dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n", di, dn, dn, di->offset); if (!list_empty(&di->lease_list)) { @@ -1567,7 +1567,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) inode = d_inode(dentry); } - dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, + dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry, dentry, inode, ceph_dentry(dentry)->offset); /* always trust cached snapped dentries, snapdir dentry */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 11929d2bb594..c3b8e8e0bf17 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1974,6 +1974,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) return -EOPNOTSUPP; + if (!src_fsc->have_copy_from2) + return -EOPNOTSUPP; + /* * Striped file layouts require that we copy partial objects, but the * OSD copy-from operation only supports full-object copies. Limit @@ -2101,8 +2104,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, &dst_oid, &dst_oloc, CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0); + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, + dst_ci->i_truncate_seq, dst_ci->i_truncate_size, + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); if (err) { + if (err == -EOPNOTSUPP) { + src_fsc->have_copy_from2 = false; + pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); + } dout("ceph_osdc_copy_from returned %d\n", err); if (!ret) ret = err; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index c07407586ce8..d01710a16a4a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -55,11 +55,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode->i_state & I_NEW) dout("get_inode created new inode %p %llx.%llx ino %llx\n", inode, ceph_vinop(inode), (u64)inode->i_ino); - unlock_new_inode(inode); - } dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino, vino.snap, inode); @@ -88,6 +86,10 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + return inode; } @@ -728,8 +730,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, static int fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_reply_info_in *iinfo, struct ceph_mds_reply_dirfrag *dirinfo, - struct ceph_mds_session *session, - unsigned long ttl_from, int cap_fmode, + struct ceph_mds_session *session, int cap_fmode, struct ceph_cap_reservation *caps_reservation) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; @@ -754,8 +755,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page, info_caps = le32_to_cpu(info->cap.caps); /* prealloc new cap struct */ - if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) + if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) { new_cap = ceph_get_cap(mdsc, caps_reservation); + if (!new_cap) + return -ENOMEM; + } /* * prealloc xattr data, if it looks like we'll need it. only @@ -1237,7 +1241,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) if (dir) { err = fill_inode(dir, NULL, &rinfo->diri, rinfo->dirfrag, - session, req->r_request_started, -1, + session, -1, &req->r_caps_reservation); if (err < 0) goto done; @@ -1302,18 +1306,22 @@ retry_lookup: err = PTR_ERR(in); goto done; } - req->r_target_inode = in; err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, - session, req->r_request_started, + session, (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && - rinfo->head->result == 0) ? req->r_fmode : -1, + rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { pr_err("fill_inode badness %p %llx.%llx\n", in, ceph_vinop(in)); + if (in->i_state & I_NEW) + discard_new_inode(in); goto done; } + req->r_target_inode = in; + if (in->i_state & I_NEW) + unlock_new_inode(in); } /* @@ -1493,12 +1501,18 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, continue; } rc = fill_inode(in, NULL, &rde->inode, NULL, session, - req->r_request_started, -1, - &req->r_caps_reservation); + -1, &req->r_caps_reservation); if (rc < 0) { pr_err("fill_inode badness on %p got %d\n", in, rc); err = rc; + if (in->i_state & I_NEW) { + ihold(in); + discard_new_inode(in); + } + } else if (in->i_state & I_NEW) { + unlock_new_inode(in); } + /* avoid calling iput_final() in mds dispatch threads */ ceph_async_iput(in); } @@ -1694,19 +1708,24 @@ retry_lookup: } ret = fill_inode(in, NULL, &rde->inode, NULL, session, - req->r_request_started, -1, - &req->r_caps_reservation); + -1, &req->r_caps_reservation); if (ret < 0) { pr_err("fill_inode badness on %p\n", in); if (d_really_is_negative(dn)) { /* avoid calling iput_final() in mds * dispatch threads */ + if (in->i_state & I_NEW) { + ihold(in); + discard_new_inode(in); + } ceph_async_iput(in); } d_drop(dn); err = ret; goto next_item; } + if (in->i_state & I_NEW) + unlock_new_inode(in); if (d_really_is_negative(dn)) { if (ceph_security_xattr_deadlock(in)) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 145d46ba25ae..bbbbddf71326 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> +#include <linux/bits.h> #include "super.h" #include "mds_client.h" @@ -530,6 +531,7 @@ const char *ceph_session_state_name(int s) case CEPH_MDS_SESSION_OPEN: return "open"; case CEPH_MDS_SESSION_HUNG: return "hung"; case CEPH_MDS_SESSION_CLOSING: return "closing"; + case CEPH_MDS_SESSION_CLOSED: return "closed"; case CEPH_MDS_SESSION_RESTARTING: return "restarting"; case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; case CEPH_MDS_SESSION_REJECTED: return "rejected"; @@ -537,7 +539,7 @@ const char *ceph_session_state_name(int s) } } -static struct ceph_mds_session *get_session(struct ceph_mds_session *s) +struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) { if (refcount_inc_not_zero(&s->s_ref)) { dout("mdsc get_session %p %d -> %d\n", s, @@ -568,7 +570,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, { if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return NULL; - return get_session(mdsc->sessions[mds]); + return ceph_get_mds_session(mdsc->sessions[mds]); } static bool __have_session(struct ceph_mds_client *mdsc, int mds) @@ -597,7 +599,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; - if (mds >= mdsc->mdsmap->m_num_mds) + if (mds >= mdsc->mdsmap->possible_max_rank) return ERR_PTR(-EINVAL); s = kzalloc(sizeof(*s), GFP_NOFS); @@ -674,7 +676,6 @@ static void __unregister_session(struct ceph_mds_client *mdsc, dout("__unregister_session mds%d %p\n", s->s_mds, s); BUG_ON(mdsc->sessions[s->s_mds] != s); mdsc->sessions[s->s_mds] = NULL; - s->s_state = 0; ceph_con_close(&s->s_con); ceph_put_mds_session(s); atomic_dec(&mdsc->num_sessions); @@ -878,7 +879,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry) * Called under mdsc->mutex. */ static int __choose_mds(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req) + struct ceph_mds_request *req, + bool *random) { struct inode *inode; struct ceph_inode_info *ci; @@ -888,6 +890,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, u32 hash = req->r_direct_hash; bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + if (random) + *random = false; + /* * is there a specific mds we should try? ignore hint if we have * no session and the mds is not up (active or recovering). @@ -895,7 +900,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_resend_mds >= 0 && (__have_session(mdsc, req->r_resend_mds) || ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { - dout("choose_mds using resend_mds mds%d\n", + dout("%s using resend_mds mds%d\n", __func__, req->r_resend_mds); return req->r_resend_mds; } @@ -913,7 +918,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, rcu_read_lock(); inode = get_nonsnap_parent(req->r_dentry); rcu_read_unlock(); - dout("__choose_mds using snapdir's parent %p\n", inode); + dout("%s using snapdir's parent %p\n", __func__, inode); } } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ @@ -933,7 +938,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, /* direct snapped/virtual snapdir requests * based on parent dir inode */ inode = get_nonsnap_parent(parent); - dout("__choose_mds using nonsnap parent %p\n", inode); + dout("%s using nonsnap parent %p\n", __func__, inode); } else { /* dentry target */ inode = d_inode(req->r_dentry); @@ -949,8 +954,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc, rcu_read_unlock(); } - dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, - (int)hash, mode); + dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash, + hash, mode); if (!inode) goto random; ci = ceph_inode(inode); @@ -968,30 +973,33 @@ static int __choose_mds(struct ceph_mds_client *mdsc, get_random_bytes(&r, 1); r %= frag.ndist; mds = frag.dist[r]; - dout("choose_mds %p %llx.%llx " - "frag %u mds%d (%d/%d)\n", - inode, ceph_vinop(inode), - frag.frag, mds, - (int)r, frag.ndist); + dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n", + __func__, inode, ceph_vinop(inode), + frag.frag, mds, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= - CEPH_MDS_STATE_ACTIVE) + CEPH_MDS_STATE_ACTIVE && + !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) goto out; } /* since this file/dir wasn't known to be * replicated, then we want to look for the * authoritative mds. */ - mode = USE_AUTH_MDS; if (frag.mds >= 0) { /* choose auth mds */ mds = frag.mds; - dout("choose_mds %p %llx.%llx " - "frag %u mds%d (auth)\n", - inode, ceph_vinop(inode), frag.frag, mds); + dout("%s %p %llx.%llx frag %u mds%d (auth)\n", + __func__, inode, ceph_vinop(inode), + frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= - CEPH_MDS_STATE_ACTIVE) - goto out; + CEPH_MDS_STATE_ACTIVE) { + if (mode == USE_ANY_MDS && + !ceph_mdsmap_is_laggy(mdsc->mdsmap, + mds)) + goto out; + } } + mode = USE_AUTH_MDS; } } @@ -1007,7 +1015,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, goto random; } mds = cap->session->s_mds; - dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", + dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__, inode, ceph_vinop(inode), mds, cap == ci->i_auth_cap ? "auth " : "", cap); spin_unlock(&ci->i_ceph_lock); @@ -1018,8 +1026,11 @@ out: return mds; random: + if (random) + *random = true; + mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); - dout("choose_mds chose random mds%d\n", mds); + dout("%s chose random mds%d\n", __func__, mds); return mds; } @@ -1045,20 +1056,21 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) return msg; } +static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; +#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8) static void encode_supported_features(void **p, void *end) { - static const unsigned char bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; - static const size_t count = ARRAY_SIZE(bits); + static const size_t count = ARRAY_SIZE(feature_bits); if (count > 0) { size_t i; - size_t size = ((size_t)bits[count - 1] + 64) / 64 * 8; + size_t size = FEATURE_BYTES(count); BUG_ON(*p + 4 + size > end); ceph_encode_32(p, size); memset(*p, 0, size); for (i = 0; i < count; i++) - ((unsigned char*)(*p))[i / 8] |= 1 << (bits[i] % 8); + ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8); *p += size; } else { BUG_ON(*p + 4 > end); @@ -1079,6 +1091,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; + size_t size, count; void *p, *end; const char* metadata[][2] = { @@ -1096,8 +1109,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 strlen(metadata[i][1]); metadata_key_count++; } + /* supported feature */ - extra_bytes += 4 + 8; + size = 0; + count = ARRAY_SIZE(feature_bits); + if (count > 0) + size = FEATURE_BYTES(count); + extra_bytes += 4 + size; /* Allocate the message */ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, @@ -1117,7 +1135,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map<string, string> * - * ClientSession messages with metadata are v2 + * ClientSession messages with metadata are v3 */ msg->hdr.version = cpu_to_le16(3); msg->hdr.compat_version = cpu_to_le16(1); @@ -1219,7 +1237,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *ts; int i, mds = session->s_mds; - if (mds >= mdsc->mdsmap->m_num_mds) + if (mds >= mdsc->mdsmap->possible_max_rank) return; mi = &mdsc->mdsmap->m_info[mds]; @@ -1967,7 +1985,7 @@ void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, if (mdsc->stopping) return; - get_session(session); + ceph_get_mds_session(session); if (queue_work(mdsc->fsc->cap_wq, &session->s_cap_release_work)) { dout("cap release work queued\n"); @@ -2072,7 +2090,6 @@ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); - struct timespec64 ts; if (!req) return ERR_PTR(-ENOMEM); @@ -2091,8 +2108,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); - ktime_get_coarse_real_ts64(&ts); - req->r_stamp = timespec64_trunc(ts, mdsc->fsc->sb->s_time_gran); + ktime_get_coarse_real_ts64(&req->r_stamp); req->r_op = op; req->r_direct_mode = mode; @@ -2518,6 +2534,26 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, } /* + * called under mdsc->mutex + */ +static int __send_request(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_mds_request *req, + bool drop_cap_releases) +{ + int err; + + err = __prepare_send_request(mdsc, req, session->s_mds, + drop_cap_releases); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + + return err; +} + +/* * send request, or put it on the appropriate wait list. */ static void __do_request(struct ceph_mds_client *mdsc, @@ -2526,6 +2562,7 @@ static void __do_request(struct ceph_mds_client *mdsc, struct ceph_mds_session *session = NULL; int mds = -1; int err = 0; + bool random; if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) @@ -2558,15 +2595,14 @@ static void __do_request(struct ceph_mds_client *mdsc, if (!(mdsc->fsc->mount_options->flags & CEPH_MOUNT_OPT_MOUNTWAIT) && !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { - err = -ENOENT; - pr_info("probably no mds server is up\n"); + err = -EHOSTUNREACH; goto finish; } } put_request_session(req); - mds = __choose_mds(mdsc, req); + mds = __choose_mds(mdsc, req, &random); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { dout("do_request no mds or not active, waiting for map\n"); @@ -2583,7 +2619,7 @@ static void __do_request(struct ceph_mds_client *mdsc, goto finish; } } - req->r_session = get_session(session); + req->r_session = ceph_get_mds_session(session); dout("do_request mds%d session %p state %s\n", mds, session, ceph_session_state_name(session->s_state)); @@ -2594,8 +2630,12 @@ static void __do_request(struct ceph_mds_client *mdsc, goto out_session; } if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) + session->s_state == CEPH_MDS_SESSION_CLOSING) { __open_session(mdsc, session); + /* retry the same mds later */ + if (random) + req->r_resend_mds = mds; + } list_add(&req->r_wait, &session->s_waiting); goto out_session; } @@ -2606,11 +2646,7 @@ static void __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; - err = __prepare_send_request(mdsc, req, mds, false); - if (!err) { - ceph_msg_get(req->r_request); - ceph_con_send(&session->s_con, req->r_request); - } + err = __send_request(mdsc, session, req, false); out_session: ceph_put_mds_session(session); @@ -2863,7 +2899,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } else { - int mds = __choose_mds(mdsc, req); + int mds = __choose_mds(mdsc, req, NULL); if (mds >= 0 && mds != req->r_session->s_mds) { dout("but auth changed, so resending\n"); __do_request(mdsc, req); @@ -2879,6 +2915,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); + /* last request during umount? */ + if (mdsc->stopping && !__get_oldest_req(mdsc)) + complete_all(&mdsc->safe_umount_waiters); + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { /* * We already handled the unsafe response, now do the @@ -2889,9 +2929,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) */ dout("got safe reply %llu, mds%d\n", tid, mds); - /* last unsafe request during umount? */ - if (mdsc->stopping && !__get_oldest_req(mdsc)) - complete_all(&mdsc->safe_umount_waiters); mutex_unlock(&mdsc->mutex); goto out; } @@ -3106,7 +3143,7 @@ static void handle_session(struct ceph_mds_session *session, mutex_lock(&mdsc->mutex); if (op == CEPH_SESSION_CLOSE) { - get_session(session); + ceph_get_mds_session(session); __unregister_session(mdsc, session); } /* FIXME: this ttl calculation is generous */ @@ -3144,6 +3181,7 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_CLOSE: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect denied\n", session->s_mds); + session->s_state = CEPH_MDS_SESSION_CLOSED; cleanup_session_requests(mdsc, session); remove_session_caps(session); wake = 2; /* for good measure */ @@ -3211,7 +3249,6 @@ bad: return; } - /* * called under session->mutex. */ @@ -3220,18 +3257,12 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req, *nreq; struct rb_node *p; - int err; dout("replay_unsafe_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); - list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { - err = __prepare_send_request(mdsc, req, session->s_mds, true); - if (!err) { - ceph_msg_get(req->r_request); - ceph_con_send(&session->s_con, req->r_request); - } - } + list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) + __send_request(mdsc, session, req, true); /* * also re-send old requests when MDS enters reconnect stage. So that MDS @@ -3246,14 +3277,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, if (req->r_attempts == 0) continue; /* only old requests */ if (req->r_session && - req->r_session->s_mds == session->s_mds) { - err = __prepare_send_request(mdsc, req, - session->s_mds, true); - if (!err) { - ceph_msg_get(req->r_request); - ceph_con_send(&session->s_con, req->r_request); - } - } + req->r_session->s_mds == session->s_mds) + __send_request(mdsc, session, req, true); } mutex_unlock(&mdsc->mutex); } @@ -3764,7 +3789,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, dout("check_new_map new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); - for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { if (!mdsc->sessions[i]) continue; s = mdsc->sessions[i]; @@ -3778,9 +3803,9 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", ceph_session_state_name(s->s_state)); - if (i >= newmap->m_num_mds) { + if (i >= newmap->possible_max_rank) { /* force close session for stopped mds */ - get_session(s); + ceph_get_mds_session(s); __unregister_session(mdsc, s); __wake_requests(mdsc, &s->s_waiting); mutex_unlock(&mdsc->mutex); @@ -3835,7 +3860,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } } - for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; @@ -4381,7 +4406,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { if (mdsc->sessions[i]) { - session = get_session(mdsc->sessions[i]); + session = ceph_get_mds_session(mdsc->sessions[i]); __unregister_session(mdsc, session); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); @@ -4609,11 +4634,8 @@ static struct ceph_connection *con_get(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; - if (get_session(s)) { - dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref)); + if (ceph_get_mds_session(s)) return con; - } - dout("mdsc con_get %p FAIL\n", s); return NULL; } @@ -4621,7 +4643,6 @@ static void con_put(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; - dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1); ceph_put_mds_session(s); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 14c7e8c49970..27a7446e10d3 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -17,22 +17,31 @@ #include <linux/ceph/auth.h> /* The first 8 bits are reserved for old ceph releases */ -#define CEPHFS_FEATURE_MIMIC 8 -#define CEPHFS_FEATURE_REPLY_ENCODING 9 -#define CEPHFS_FEATURE_RECLAIM_CLIENT 10 -#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11 -#define CEPHFS_FEATURE_MULTI_RECONNECT 12 +enum ceph_feature_type { + CEPHFS_FEATURE_MIMIC = 8, + CEPHFS_FEATURE_REPLY_ENCODING, + CEPHFS_FEATURE_RECLAIM_CLIENT, + CEPHFS_FEATURE_LAZY_CAP_WANTED, + CEPHFS_FEATURE_MULTI_RECONNECT, + + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT, +}; -#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ +/* + * This will always have the highest feature bit value + * as the last element of the array. + */ +#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 0, 1, 2, 3, 4, 5, 6, 7, \ CEPHFS_FEATURE_MIMIC, \ CEPHFS_FEATURE_REPLY_ENCODING, \ CEPHFS_FEATURE_LAZY_CAP_WANTED, \ CEPHFS_FEATURE_MULTI_RECONNECT, \ + \ + CEPHFS_FEATURE_MAX, \ } #define CEPHFS_FEATURES_CLIENT_REQUIRED {} - /* * Some lock dependencies: * @@ -151,7 +160,8 @@ enum { CEPH_MDS_SESSION_RESTARTING = 5, CEPH_MDS_SESSION_RECONNECTING = 6, CEPH_MDS_SESSION_CLOSING = 7, - CEPH_MDS_SESSION_REJECTED = 8, + CEPH_MDS_SESSION_CLOSED = 8, + CEPH_MDS_SESSION_REJECTED = 9, }; struct ceph_mds_session { @@ -174,6 +184,7 @@ struct ceph_mds_session { /* protected by s_cap_lock */ spinlock_t s_cap_lock; + refcount_t s_ref; struct list_head s_caps; /* all caps issued by this session */ struct ceph_cap *s_cap_iterator; int s_nr_caps; @@ -188,7 +199,6 @@ struct ceph_mds_session { unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; - refcount_t s_ref; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ }; @@ -224,6 +234,7 @@ struct ceph_mds_request { struct rb_node r_node; struct ceph_mds_client *r_mdsc; + struct kref r_kref; int r_op; /* mds op code */ /* operation on what? */ @@ -294,7 +305,6 @@ struct ceph_mds_request { int r_resend_mds; /* mds to resend to next, if any*/ u32 r_sent_on_mseq; /* cap mseq request was sent at*/ - struct kref r_kref; struct list_head r_wait; struct completion r_completion; struct completion r_safe_completion; @@ -451,15 +461,10 @@ extern const char *ceph_mds_op_name(int op); extern struct ceph_mds_session * __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); -static inline struct ceph_mds_session * -ceph_get_mds_session(struct ceph_mds_session *s) -{ - refcount_inc(&s->s_ref); - return s; -} - extern const char *ceph_session_state_name(int s); +extern struct ceph_mds_session * +ceph_get_mds_session(struct ceph_mds_session *s); extern void ceph_put_mds_session(struct ceph_mds_session *s); extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 471bac335fae..889627817e52 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -13,30 +13,25 @@ #include "super.h" +#define CEPH_MDS_IS_READY(i, ignore_laggy) \ + (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy) -/* - * choose a random mds that is "up" (i.e. has a state > 0), or -1. - */ -int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) { int n = 0; int i, j; - /* special case for one mds */ - if (1 == m->m_num_mds && m->m_info[0].state > 0) - return 0; - /* count */ - for (i = 0; i < m->m_num_mds; i++) - if (m->m_info[i].state > 0) + for (i = 0; i < m->possible_max_rank; i++) + if (CEPH_MDS_IS_READY(i, ignore_laggy)) n++; if (n == 0) return -1; /* pick */ n = prandom_u32() % n; - for (j = 0, i = 0; i < m->m_num_mds; i++) { - if (m->m_info[i].state > 0) + for (j = 0, i = 0; i < m->possible_max_rank; i++) { + if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; if (j > n) break; @@ -45,6 +40,20 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return i; } +/* + * choose a random mds that is "up" (i.e. has a state > 0), or -1. + */ +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +{ + int mds; + + mds = __mdsmap_get_random_mds(m, false); + if (mds == m->possible_max_rank || mds == -1) + mds = __mdsmap_get_random_mds(m, true); + + return mds == m->possible_max_rank ? -1 : mds; +} + #define __decode_and_drop_type(p, end, type, bad) \ do { \ if (*p + sizeof(type) > end) \ @@ -138,14 +147,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_session_autoclose = ceph_decode_32(p); m->m_max_file_size = ceph_decode_64(p); m->m_max_mds = ceph_decode_32(p); - m->m_num_mds = m->m_max_mds; - m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); + /* + * pick out the active nodes as the m_num_active_mds, the + * m_num_active_mds maybe larger than m_max_mds when decreasing + * the max_mds in cluster side, in other case it should less + * than or equal to m_max_mds. + */ + m->m_num_active_mds = n = ceph_decode_32(p); + + /* + * the possible max rank, it maybe larger than the m_num_active_mds, + * for example if the mds_max == 2 in the cluster, when the MDS(0) + * was laggy and being replaced by a new MDS, we will temporarily + * receive a new mds map with n_num_mds == 1 and the active MDS(1), + * and the mds rank >= m_num_active_mds. + */ + m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); + + m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS); if (!m->m_info) goto nomem; /* pick out active nodes from mds_info (state > 0) */ - n = ceph_decode_32(p); for (i = 0; i < n; i++) { u64 global_id; u32 namelen; @@ -215,18 +239,15 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_mds_state_name(state), laggy ? "(laggy)" : ""); - if (mds < 0 || state <= 0) + if (mds < 0 || mds >= m->possible_max_rank) { + pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); continue; + } - if (mds >= m->m_num_mds) { - int new_num = max(mds + 1, m->m_num_mds * 2); - void *new_m_info = krealloc(m->m_info, - new_num * sizeof(*m->m_info), - GFP_NOFS | __GFP_ZERO); - if (!new_m_info) - goto nomem; - m->m_info = new_m_info; - m->m_num_mds = new_num; + if (state <= 0) { + pr_warn("mdsmap_decode got incorrect state(%s)\n", + ceph_mds_state_name(state)); + continue; } info = &m->m_info[mds]; @@ -247,14 +268,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = NULL; } } - if (m->m_num_mds > m->m_max_mds) { - /* find max up mds */ - for (i = m->m_num_mds; i >= m->m_max_mds; i--) { - if (i == 0 || m->m_info[i-1].state > 0) - break; - } - m->m_num_mds = i; - } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); @@ -296,14 +309,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) for (i = 0; i < n; i++) { s32 mds = ceph_decode_32(p); - if (mds >= 0 && mds < m->m_num_mds) { + if (mds >= 0 && mds < m->possible_max_rank) { if (m->m_info[mds].laggy) num_laggy++; } } m->m_num_laggy = num_laggy; - if (n > m->m_num_mds) { + if (n > m->possible_max_rank) { void *new_m_info = krealloc(m->m_info, n * sizeof(*m->m_info), GFP_NOFS | __GFP_ZERO); @@ -311,7 +324,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) goto nomem; m->m_info = new_m_info; } - m->m_num_mds = n; + m->possible_max_rank = n; } /* inc */ @@ -382,7 +395,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->m_num_mds; i++) + for (i = 0; i < m->possible_max_rank; i++) kfree(m->m_info[i].export_targets); kfree(m->m_info); kfree(m->m_data_pg_pools); @@ -396,9 +409,9 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) return false; if (m->m_damaged) return false; - if (m->m_num_laggy > 0) + if (m->m_num_laggy == m->m_num_active_mds) return false; - for (i = 0; i < m->m_num_mds; i++) { + for (i = 0; i < m->possible_max_rank; i++) { if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) nr_active++; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 29a795f975df..bfb8aead0555 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -107,7 +107,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } - static int ceph_sync_fs(struct super_block *sb, int wait) { struct ceph_fs_client *fsc = ceph_sb_to_client(sb); @@ -211,7 +210,6 @@ struct ceph_parse_opts_ctx { /* * Parse the source parameter. Distinguish the server list from the path. - * Internally we do not include the leading '/' in the path. * * The source will look like: * <server_spec>[,<server_spec>...]:[<path>] @@ -232,12 +230,15 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - if (strlen(dev_name_end) > 1) { - kfree(fsopt->server_path); - fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); - if (!fsopt->server_path) - return -ENOMEM; - } + kfree(fsopt->server_path); + + /* + * The server_path will include the whole chars from userland + * including the leading '/'. + */ + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) + return -ENOMEM; } else { dev_name_end = dev_name + strlen(dev_name); } @@ -461,6 +462,73 @@ static int strcmp_null(const char *s1, const char *s2) return strcmp(s1, s2); } +/** + * path_remove_extra_slash - Remove the extra slashes in the server path + * @server_path: the server path and could be NULL + * + * Return NULL if the path is NULL or only consists of "/", or a string + * without any extra slashes including the leading slash(es) and the + * slash(es) at the end of the server path, such as: + * "//dir1////dir2///" --> "dir1/dir2" + */ +static char *path_remove_extra_slash(const char *server_path) +{ + const char *path = server_path; + const char *cur, *end; + char *buf, *p; + int len; + + /* if the server path is omitted */ + if (!path) + return NULL; + + /* remove all the leading slashes */ + while (*path == '/') + path++; + + /* if the server path only consists of slashes */ + if (*path == '\0') + return NULL; + + len = strlen(path); + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + end = path + len; + p = buf; + do { + cur = strchr(path, '/'); + if (!cur) + cur = end; + + len = cur - path; + + /* including one '/' */ + if (cur != end) + len += 1; + + memcpy(p, path, len); + p += len; + + while (cur <= end && *cur == '/') + cur++; + path = cur; + } while (path < end); + + *p = '\0'; + + /* + * remove the last slash if there has and just to make sure that + * we will get something like "dir1/dir2" + */ + if (*(--p) == '/') + *p = '\0'; + + return buf; +} + static int compare_mount_options(struct ceph_mount_options *new_fsopt, struct ceph_options *new_opt, struct ceph_fs_client *fsc) @@ -468,6 +536,7 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, struct ceph_mount_options *fsopt1 = new_fsopt; struct ceph_mount_options *fsopt2 = fsc->mount_options; int ofs = offsetof(struct ceph_mount_options, snapdir_name); + char *p1, *p2; int ret; ret = memcmp(fsopt1, fsopt2, ofs); @@ -480,9 +549,21 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); if (ret) return ret; - ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + + p1 = path_remove_extra_slash(fsopt1->server_path); + if (IS_ERR(p1)) + return PTR_ERR(p1); + p2 = path_remove_extra_slash(fsopt2->server_path); + if (IS_ERR(p2)) { + kfree(p1); + return PTR_ERR(p2); + } + ret = strcmp_null(p1, p2); + kfree(p1); + kfree(p2); if (ret) return ret; + ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq); if (ret) return ret; @@ -637,6 +718,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->sb = NULL; fsc->mount_state = CEPH_MOUNT_MOUNTING; fsc->filp_gen = 1; + fsc->have_copy_from2 = true; atomic_long_set(&fsc->writeback_count, 0); @@ -788,7 +870,6 @@ static void destroy_caches(void) ceph_fscache_unregister(); } - /* * ceph_umount_begin - initiate forced umount. Tear down down the * mount, skipping steps that may hang while waiting for server(s). @@ -868,9 +949,6 @@ out: return root; } - - - /* * mount: join the ceph cluster, and open root directory. */ @@ -885,7 +963,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, mutex_lock(&fsc->client->mount_mutex); if (!fsc->sb->s_root) { - const char *path; + const char *path, *p; err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; @@ -897,17 +975,22 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto out; } - if (!fsc->mount_options->server_path) { - path = ""; - dout("mount opening path \\t\n"); - } else { - path = fsc->mount_options->server_path + 1; - dout("mount opening path %s\n", path); + p = path_remove_extra_slash(fsc->mount_options->server_path); + if (IS_ERR(p)) { + err = PTR_ERR(p); + goto out; } + /* if the server path is omitted or just consists of '/' */ + if (!p) + path = ""; + else + path = p; + dout("mount opening path '%s'\n", path); ceph_fs_debugfs_init(fsc); root = open_root_dentry(fsc, path, started); + kfree(p); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; @@ -1070,6 +1153,11 @@ static int ceph_get_tree(struct fs_context *fc) return 0; out_splat: + if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) { + pr_info("No mds server is up or the cluster is laggy\n"); + err = -EHOSTUNREACH; + } + ceph_mdsc_close_sessions(fsc->mdsc); deactivate_locked_super(sb); goto out_final; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3bf1a01cd536..1e456a9011bb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -106,6 +106,8 @@ struct ceph_fs_client { unsigned long last_auto_reconnect; bool blacklisted; + bool have_copy_from2; + u32 filp_gen; loff_t max_file_size; diff --git a/fs/ceph/util.c b/fs/ceph/util.c new file mode 100644 index 000000000000..2c34875675bf --- /dev/null +++ b/fs/ceph/util.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some non-inline ceph helpers + */ +#include <linux/module.h> +#include <linux/ceph/types.h> + +/* + * return true if @layout appears to be valid + */ +int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) +{ + __u32 su = layout->stripe_unit; + __u32 sc = layout->stripe_count; + __u32 os = layout->object_size; + + /* stripe unit, object size must be non-zero, 64k increment */ + if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + /* object size must be a multiple of stripe unit */ + if (os < su || os % su) + return 0; + /* stripe count must be non-zero */ + if (!sc) + return 0; + return 1; +} + +void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit); + fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count); + fl->object_size = le32_to_cpu(legacy->fl_object_size); + fl->pool_id = le32_to_cpu(legacy->fl_pg_pool); + if (fl->pool_id == 0 && fl->stripe_unit == 0 && + fl->stripe_count == 0 && fl->object_size == 0) + fl->pool_id = -1; +} + +void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit); + legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count); + legacy->fl_object_size = cpu_to_le32(fl->object_size); + if (fl->pool_id >= 0) + legacy->fl_pg_pool = cpu_to_le32(fl->pool_id); + else + legacy->fl_pg_pool = 0; +} + +int ceph_flags_to_mode(int flags) +{ + int mode; + +#ifdef O_DIRECTORY /* fixme */ + if ((flags & O_DIRECTORY) == O_DIRECTORY) + return CEPH_FILE_MODE_PIN; +#endif + + switch (flags & O_ACCMODE) { + case O_WRONLY: + mode = CEPH_FILE_MODE_WR; + break; + case O_RDONLY: + mode = CEPH_FILE_MODE_RD; + break; + case O_RDWR: + case O_ACCMODE: /* this is what the VFS does */ + mode = CEPH_FILE_MODE_RDWR; + break; + } +#ifdef O_LAZY + if (flags & O_LAZY) + mode |= CEPH_FILE_MODE_LAZY; +#endif + + return mode; +} + +int ceph_caps_for_mode(int mode) +{ + int caps = CEPH_CAP_PIN; + + if (mode & CEPH_FILE_MODE_RD) + caps |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; + if (mode & CEPH_FILE_MODE_WR) + caps |= CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | + CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; + if (mode & CEPH_FILE_MODE_LAZY) + caps |= CEPH_CAP_FILE_LAZYIO; + + return caps; +} diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index cb18ee637cb7..7b8a070a782d 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -655,7 +655,7 @@ static int __build_xattrs(struct inode *inode) u32 len; const char *name, *val; struct ceph_inode_info *ci = ceph_inode(inode); - int xattr_version; + u64 xattr_version; struct ceph_inode_xattr **xattrs = NULL; int err = 0; int i; @@ -851,7 +851,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, req_mask = __get_request_mask(inode); spin_lock(&ci->i_ceph_lock); - dout("getxattr %p ver=%lld index_ver=%lld\n", inode, + dout("getxattr %p name '%s' ver=%lld index_ver=%lld\n", inode, name, ci->i_xattrs.version, ci->i_xattrs.index_version); if (ci->i_xattrs.version == 0 || @@ -1078,7 +1078,8 @@ retry: } } - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); + dout("setxattr %p name '%s' issued %s\n", inode, name, + ceph_cap_string(issued)); __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, name_len, val_len); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 19f6e592b941..276e4b5ea8e0 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -611,12 +611,12 @@ static int cifs_stats_proc_open(struct inode *inode, struct file *file) return single_open(file, cifs_stats_proc_show, NULL); } -static const struct file_operations cifs_stats_proc_fops = { - .open = cifs_stats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_stats_proc_write, +static const struct proc_ops cifs_stats_proc_ops = { + .proc_open = cifs_stats_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_stats_proc_write, }; #ifdef CONFIG_CIFS_SMB_DIRECT @@ -640,12 +640,12 @@ static int name##_open(struct inode *inode, struct file *file) \ return single_open(file, name##_proc_show, NULL); \ } \ \ -static const struct file_operations cifs_##name##_proc_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - .write = name##_write, \ +static const struct proc_ops cifs_##name##_proc_fops = { \ + .proc_open = name##_open, \ + .proc_read = seq_read, \ + .proc_lseek = seq_lseek, \ + .proc_release = single_release, \ + .proc_write = name##_write, \ } PROC_FILE_DEFINE(rdma_readwrite_threshold); @@ -659,11 +659,11 @@ PROC_FILE_DEFINE(smbd_receive_credit_max); #endif static struct proc_dir_entry *proc_fs_cifs; -static const struct file_operations cifsFYI_proc_fops; -static const struct file_operations cifs_lookup_cache_proc_fops; -static const struct file_operations traceSMB_proc_fops; -static const struct file_operations cifs_security_flags_proc_fops; -static const struct file_operations cifs_linux_ext_proc_fops; +static const struct proc_ops cifsFYI_proc_ops; +static const struct proc_ops cifs_lookup_cache_proc_ops; +static const struct proc_ops traceSMB_proc_ops; +static const struct proc_ops cifs_security_flags_proc_ops; +static const struct proc_ops cifs_linux_ext_proc_ops; void cifs_proc_init(void) @@ -678,18 +678,18 @@ cifs_proc_init(void) proc_create_single("open_files", 0400, proc_fs_cifs, cifs_debug_files_proc_show); - proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops); - proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops); - proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops); + proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops); + proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops); proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs, - &cifs_linux_ext_proc_fops); + &cifs_linux_ext_proc_ops); proc_create("SecurityFlags", 0644, proc_fs_cifs, - &cifs_security_flags_proc_fops); + &cifs_security_flags_proc_ops); proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, - &cifs_lookup_cache_proc_fops); + &cifs_lookup_cache_proc_ops); #ifdef CONFIG_CIFS_DFS_UPCALL - proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_fops); + proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops); #endif #ifdef CONFIG_CIFS_SMB_DIRECT @@ -774,12 +774,12 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, return count; } -static const struct file_operations cifsFYI_proc_fops = { - .open = cifsFYI_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifsFYI_proc_write, +static const struct proc_ops cifsFYI_proc_ops = { + .proc_open = cifsFYI_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifsFYI_proc_write, }; static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) @@ -805,12 +805,12 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file, return count; } -static const struct file_operations cifs_linux_ext_proc_fops = { - .open = cifs_linux_ext_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_linux_ext_proc_write, +static const struct proc_ops cifs_linux_ext_proc_ops = { + .proc_open = cifs_linux_ext_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_linux_ext_proc_write, }; static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) @@ -836,12 +836,12 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file, return count; } -static const struct file_operations cifs_lookup_cache_proc_fops = { - .open = cifs_lookup_cache_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_lookup_cache_proc_write, +static const struct proc_ops cifs_lookup_cache_proc_ops = { + .proc_open = cifs_lookup_cache_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_lookup_cache_proc_write, }; static int traceSMB_proc_show(struct seq_file *m, void *v) @@ -867,12 +867,12 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, return count; } -static const struct file_operations traceSMB_proc_fops = { - .open = traceSMB_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = traceSMB_proc_write, +static const struct proc_ops traceSMB_proc_ops = { + .proc_open = traceSMB_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = traceSMB_proc_write, }; static int cifs_security_flags_proc_show(struct seq_file *m, void *v) @@ -978,12 +978,12 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, return count; } -static const struct file_operations cifs_security_flags_proc_fops = { - .open = cifs_security_flags_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_security_flags_proc_write, +static const struct proc_ops cifs_security_flags_proc_ops = { + .proc_open = cifs_security_flags_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_security_flags_proc_write, }; #else inline void cifs_proc_init(void) diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 9a384d1e27b4..43c1b43a07ec 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -8,6 +8,7 @@ #include <linux/jhash.h> #include <linux/ktime.h> #include <linux/slab.h> +#include <linux/proc_fs.h> #include <linux/nls.h> #include <linux/workqueue.h> #include "cifsglob.h" @@ -211,12 +212,12 @@ static int dfscache_proc_open(struct inode *inode, struct file *file) return single_open(file, dfscache_proc_show, NULL); } -const struct file_operations dfscache_proc_fops = { - .open = dfscache_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = dfscache_proc_write, +const struct proc_ops dfscache_proc_ops = { + .proc_open = dfscache_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = dfscache_proc_write, }; #ifdef CONFIG_CIFS_DEBUG2 diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h index 76c732943f5f..99ee44f8ad07 100644 --- a/fs/cifs/dfs_cache.h +++ b/fs/cifs/dfs_cache.h @@ -24,7 +24,7 @@ struct dfs_cache_tgt_iterator { extern int dfs_cache_init(void); extern void dfs_cache_destroy(void); -extern const struct file_operations dfscache_proc_fops; +extern const struct proc_ops dfscache_proc_ops; extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_codepage, int remap, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9b547f7f5f5d..676e96a7a9f0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -113,6 +113,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) } /* revalidate if mtime or size have changed */ + fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); if (timespec64_equal(&inode->i_mtime, &fattr->cf_mtime) && cifs_i->server_eof == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", @@ -162,6 +163,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); + fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); + fattr->cf_atime = timestamp_truncate(fattr->cf_atime, inode); + fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode); /* we do not want atime to be less than mtime, it broke some apps */ if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0) inode->i_atime = fattr->cf_mtime; @@ -329,8 +333,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; - ktime_get_real_ts64(&fattr->cf_mtime); - fattr->cf_mtime = timespec64_trunc(fattr->cf_mtime, sb->s_time_gran); + ktime_get_coarse_real_ts64(&fattr->cf_mtime); fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime; fattr->cf_nlink = 2; fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL; @@ -609,10 +612,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, if (info->LastAccessTime) fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); - else { - ktime_get_real_ts64(&fattr->cf_atime); - fattr->cf_atime = timespec64_trunc(fattr->cf_atime, sb->s_time_gran); - } + else + ktime_get_coarse_real_ts64(&fattr->cf_atime); fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 680aba9c00d5..fd0b5dd68f9e 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -76,14 +76,11 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) if (ia_valid & ATTR_GID) sd_iattr->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timestamp_truncate(iattr->ia_atime, - inode); + sd_iattr->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timestamp_truncate(iattr->ia_mtime, - inode); + sd_iattr->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timestamp_truncate(iattr->ia_ctime, - inode); + sd_iattr->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index dc6cffc4feba..e742dfc66933 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -332,7 +332,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) parent = debugfs_mount->mnt_root; inode_lock(d_inode(parent)); - dentry = lookup_one_len(name, parent, strlen(name)); + if (unlikely(IS_DEADDIR(d_inode(parent)))) + dentry = ERR_PTR(-ENOENT); + else + dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { if (d_is_dir(dentry)) pr_err("Directory '%s' with parent '%s' already present!\n", @@ -681,62 +684,15 @@ static void __debugfs_file_removed(struct dentry *dentry) wait_for_completion(&fsd->active_users_drained); } -static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) -{ - int ret = 0; - - if (simple_positive(dentry)) { - dget(dentry); - if (d_is_dir(dentry)) { - ret = simple_rmdir(d_inode(parent), dentry); - if (!ret) - fsnotify_rmdir(d_inode(parent), dentry); - } else { - simple_unlink(d_inode(parent), dentry); - fsnotify_unlink(d_inode(parent), dentry); - } - if (!ret) - d_delete(dentry); - if (d_is_reg(dentry)) - __debugfs_file_removed(dentry); - dput(dentry); - } - return ret; -} - -/** - * debugfs_remove - removes a file or directory from the debugfs filesystem - * @dentry: a pointer to a the dentry of the file or directory to be - * removed. If this parameter is NULL or an error value, nothing - * will be done. - * - * This function removes a file or directory in debugfs that was previously - * created with a call to another debugfs function (like - * debugfs_create_file() or variants thereof.) - * - * This function is required to be called in order for the file to be - * removed, no automatic cleanup of files will happen when a module is - * removed, you are responsible here. - */ -void debugfs_remove(struct dentry *dentry) +static void remove_one(struct dentry *victim) { - struct dentry *parent; - int ret; - - if (IS_ERR_OR_NULL(dentry)) - return; - - parent = dentry->d_parent; - inode_lock(d_inode(parent)); - ret = __debugfs_remove(dentry, parent); - inode_unlock(d_inode(parent)); - if (!ret) - simple_release_fs(&debugfs_mount, &debugfs_mount_count); + if (d_is_reg(victim)) + __debugfs_file_removed(victim); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } -EXPORT_SYMBOL_GPL(debugfs_remove); /** - * debugfs_remove_recursive - recursively removes a directory + * debugfs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. If this * parameter is NULL or an error value, nothing will be done. * @@ -748,65 +704,16 @@ EXPORT_SYMBOL_GPL(debugfs_remove); * removed, no automatic cleanup of files will happen when a module is * removed, you are responsible here. */ -void debugfs_remove_recursive(struct dentry *dentry) +void debugfs_remove(struct dentry *dentry) { - struct dentry *child, *parent; - if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry; - down: - inode_lock(d_inode(parent)); - loop: - /* - * The parent->d_subdirs is protected by the d_lock. Outside that - * lock, the child can be unlinked and set to be freed which can - * use the d_u.d_child as the rcu head and corrupt this list. - */ - spin_lock(&parent->d_lock); - list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!simple_positive(child)) - continue; - - /* perhaps simple_empty(child) makes more sense */ - if (!list_empty(&child->d_subdirs)) { - spin_unlock(&parent->d_lock); - inode_unlock(d_inode(parent)); - parent = child; - goto down; - } - - spin_unlock(&parent->d_lock); - - if (!__debugfs_remove(child, parent)) - simple_release_fs(&debugfs_mount, &debugfs_mount_count); - - /* - * The parent->d_lock protects agaist child from unlinking - * from d_subdirs. When releasing the parent->d_lock we can - * no longer trust that the next pointer is valid. - * Restart the loop. We'll skip this one with the - * simple_positive() check. - */ - goto loop; - } - spin_unlock(&parent->d_lock); - - inode_unlock(d_inode(parent)); - child = parent; - parent = parent->d_parent; - inode_lock(d_inode(parent)); - - if (child != dentry) - /* go up */ - goto loop; - - if (!__debugfs_remove(child, parent)) - simple_release_fs(&debugfs_mount, &debugfs_mount_count); - inode_unlock(d_inode(parent)); + simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); + simple_recursive_removal(dentry, remove_one); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } -EXPORT_SYMBOL_GPL(debugfs_remove_recursive); +EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_rename - rename a file/directory in the debugfs filesystem diff --git a/fs/eventfd.c b/fs/eventfd.c index 8aa0ea8c55e8..78e41c7c3d05 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -24,6 +24,8 @@ #include <linux/seq_file.h> #include <linux/idr.h> +DEFINE_PER_CPU(int, eventfd_wake_count); + static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { @@ -60,12 +62,25 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) { unsigned long flags; + /* + * Deadlock or stack overflow issues can happen if we recurse here + * through waitqueue wakeup handlers. If the caller users potentially + * nested waitqueues with custom wakeup handlers, then it should + * check eventfd_signal_count() before calling this function. If + * it returns true, the eventfd_signal() call should be deferred to a + * safe context. + */ + if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) + return 0; + spin_lock_irqsave(&ctx->wqh.lock, flags); + this_cpu_inc(eventfd_wake_count); if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); + this_cpu_dec(eventfd_wake_count); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 88b213bd32bc..8434217549b3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6043,7 +6043,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); - } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) && + } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 86ddbb55d2b1..0d4da644df3b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -829,18 +829,12 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if (ia_valid & ATTR_ATIME) { - inode->i_atime = timestamp_truncate(attr->ia_atime, - inode); - } - if (ia_valid & ATTR_MTIME) { - inode->i_mtime = timestamp_truncate(attr->ia_mtime, - inode); - } - if (ia_valid & ATTR_CTIME) { - inode->i_ctime = timestamp_truncate(attr->ia_ctime, - inode); - } + if (ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3314a0f3405e..9d02cdcdbb07 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -875,7 +875,7 @@ static int truncate_dnode(struct dnode_of_data *dn) /* get direct node */ page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); - if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) + if (PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 1e08bd54c5fb..f1b2a1fc2a6a 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -271,6 +271,14 @@ static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) { return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; } + +static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts) +{ + if (ts.tv_nsec) + ts.tv_nsec -= ts.tv_nsec % 10000000UL; + return ts; +} + /* * truncate the various times with appropriate granularity: * root inode: @@ -308,7 +316,7 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) } if (flags & S_CTIME) { if (sbi->options.isvfat) - inode->i_ctime = timespec64_trunc(*now, 10000000); + inode->i_ctime = fat_timespec64_trunc_10ms(*now); else inode->i_ctime = fat_timespec64_trunc_2secs(*now); } diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 9616af3768e1..08e91efbce53 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -111,7 +111,7 @@ extern void fscache_enqueue_object(struct fscache_object *); * object-list.c */ #ifdef CONFIG_FSCACHE_OBJECT_LIST -extern const struct file_operations fscache_objlist_fops; +extern const struct proc_ops fscache_objlist_proc_ops; extern void fscache_objlist_add(struct fscache_object *); extern void fscache_objlist_remove(struct fscache_object *); diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 72ebfe578f40..e106a1a1600d 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -7,6 +7,7 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include <linux/module.h> +#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/key.h> @@ -405,9 +406,9 @@ static int fscache_objlist_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -const struct file_operations fscache_objlist_fops = { - .open = fscache_objlist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = fscache_objlist_release, +const struct proc_ops fscache_objlist_proc_ops = { + .proc_open = fscache_objlist_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = fscache_objlist_release, }; diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 5523446e2952..90a7bc22f7e1 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -35,7 +35,7 @@ int __init fscache_proc_init(void) #ifdef CONFIG_FSCACHE_OBJECT_LIST if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, - &fscache_objlist_fops)) + &fscache_objlist_proc_ops)) goto error_objects; #endif diff --git a/fs/inode.c b/fs/inode.c index ea15c6d9f274..c7418b0b4168 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1683,12 +1683,9 @@ EXPORT_SYMBOL(generic_update_time); */ static int update_time(struct inode *inode, struct timespec64 *time, int flags) { - int (*update_time)(struct inode *, struct timespec64 *, int); - - update_time = inode->i_op->update_time ? inode->i_op->update_time : - generic_update_time; - - return update_time(inode, time, flags); + if (inode->i_op->update_time) + return inode->i_op->update_time(inode, time, flags); + return generic_update_time(inode, time, flags); } /** @@ -2154,30 +2151,6 @@ void inode_nohighmem(struct inode *inode) EXPORT_SYMBOL(inode_nohighmem); /** - * timespec64_trunc - Truncate timespec64 to a granularity - * @t: Timespec64 - * @gran: Granularity in ns. - * - * Truncate a timespec64 to a granularity. Always rounds down. gran must - * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). - */ -struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran) -{ - /* Avoid division in the common cases 1 ns and 1 s. */ - if (gran == 1) { - /* nothing */ - } else if (gran == NSEC_PER_SEC) { - t.tv_nsec = 0; - } else if (gran > 1 && gran < NSEC_PER_SEC) { - t.tv_nsec -= t.tv_nsec % gran; - } else { - WARN(1, "illegal file time granularity: %u", gran); - } - return t; -} -EXPORT_SYMBOL(timespec64_trunc); - -/** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec * @inode: inode being updated diff --git a/fs/io_uring.c b/fs/io_uring.c index 1806afddfea5..77f22c3da30f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -585,8 +585,7 @@ struct io_submit_state { * io_kiocb alloc cache */ void *reqs[IO_IOPOLL_BATCH]; - unsigned int free_reqs; - unsigned int cur_req; + unsigned int free_reqs; /* * File reference cache @@ -754,6 +753,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); static int io_grab_files(struct io_kiocb *req); +static void io_ring_file_ref_flush(struct fixed_file_data *data); static struct kmem_cache *req_cachep; @@ -1020,21 +1020,28 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) { + if (!ctx->cq_ev_fd) + return false; if (!ctx->eventfd_async) return true; return io_wq_current_is_worker() || in_interrupt(); } -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx)) + if (trigger_ev) eventfd_signal(ctx->cq_ev_fd, 1); } +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx)); +} + /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1183,12 +1190,10 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, ret = 1; } state->free_reqs = ret - 1; - state->cur_req = 1; - req = state->reqs[0]; + req = state->reqs[ret - 1]; } else { - req = state->reqs[state->cur_req]; state->free_reqs--; - state->cur_req++; + req = state->reqs[state->free_reqs]; } got_it: @@ -1855,9 +1860,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned ioprio; int ret; - if (!req->file) - return -EBADF; - if (S_ISREG(file_inode(req->file)->i_mode)) req->flags |= REQ_F_ISREG; @@ -1866,8 +1868,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_CUR_POS; kiocb->ki_pos = req->file->f_pos; } - kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); + kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); + if (unlikely(ret)) + return ret; ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { @@ -1879,10 +1884,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, } else kiocb->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); - if (unlikely(ret)) - return ret; - /* don't allow async punt if RWF_NOWAIT was requested */ if ((kiocb->ki_flags & IOCB_NOWAIT) || (req->file->f_flags & O_NONBLOCK)) @@ -2164,10 +2165,12 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, { if (!io_op_defs[req->opcode].async_ctx) return 0; - if (!req->io && io_alloc_async_ctx(req)) - return -ENOMEM; + if (!req->io) { + if (io_alloc_async_ctx(req)) + return -ENOMEM; - io_req_map_rw(req, io_size, iovec, fast_iov, iter); + io_req_map_rw(req, io_size, iovec, fast_iov, iter); + } req->work.func = io_rw_async; return 0; } @@ -2724,9 +2727,16 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, struct io_fadvise *fa = &req->fadvise; int ret; - /* DONTNEED may block, others _should_ not */ - if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock) - return -EAGAIN; + if (force_nonblock) { + switch (fa->advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + break; + default: + return -EAGAIN; + } + } ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) @@ -2837,16 +2847,13 @@ static void io_close_finish(struct io_wq_work **workptr) int ret; ret = filp_close(req->close.put_file, req->work.files); - if (ret < 0) { + if (ret < 0) req_set_fail_links(req); - } io_cqring_add_event(req, ret); } fput(req->close.put_file); - /* we bypassed the re-issue, drop the submission reference */ - io_put_req(req); io_put_req_find_next(req, &nxt); if (nxt) io_wq_assign_next(workptr, nxt); @@ -2888,7 +2895,13 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, eagain: req->work.func = io_close_finish; - return -EAGAIN; + /* + * Do manual async queue here to avoid grabbing files - we don't + * need the files, and it'll cause io_close_finish() to close + * the file again and cause a double CQE entry for this request + */ + io_queue_async_work(req); + return 0; } static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -3083,7 +3096,8 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, else if (force_nonblock) flags |= MSG_DONTWAIT; - ret = __sys_sendmsg_sock(sock, &msg, flags); + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); if (force_nonblock && ret == -EAGAIN) return -EAGAIN; if (ret == -ERESTARTSYS) @@ -3109,6 +3123,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); if (!io || req->opcode == IORING_OP_RECV) return 0; @@ -3227,7 +3242,7 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, else if (force_nonblock) flags |= MSG_DONTWAIT; - ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags); + ret = sock_recvmsg(sock, &msg, flags); if (force_nonblock && ret == -EAGAIN) return -EAGAIN; if (ret == -ERESTARTSYS) @@ -3561,6 +3576,14 @@ static void io_poll_flush(struct io_wq_work **workptr) __io_poll_flush(req->ctx, nodes); } +static void io_poll_trigger_evfd(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + + eventfd_signal(req->ctx->cq_ev_fd, 1); + io_put_req(req); +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -3586,14 +3609,22 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (llist_empty(&ctx->poll_llist) && spin_trylock_irqsave(&ctx->completion_lock, flags)) { + bool trigger_ev; + hash_del(&req->hash_node); io_poll_complete(req, mask, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); - req = NULL; + trigger_ev = io_should_trigger_evfd(ctx); + if (trigger_ev && eventfd_signal_count()) { + trigger_ev = false; + req->work.func = io_poll_trigger_evfd; + } else { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + req = NULL; + } + spin_unlock_irqrestore(&ctx->completion_lock, flags); + __io_cqring_ev_posted(ctx, trigger_ev); } else { req->result = mask; req->llist_node.next = NULL; @@ -4815,8 +4846,7 @@ static void io_submit_state_end(struct io_submit_state *state) blk_finish_plug(&state->plug); io_file_put(state); if (state->free_reqs) - kmem_cache_free_bulk(req_cachep, state->free_reqs, - &state->reqs[state->cur_req]); + kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); } /* @@ -5041,7 +5071,8 @@ static int io_sq_thread(void *data) * reap events and wake us up. */ if (inflight || - (!time_after(jiffies, timeout) && ret != -EBUSY)) { + (!time_after(jiffies, timeout) && ret != -EBUSY && + !percpu_ref_is_dying(&ctx->refs))) { cond_resched(); continue; } @@ -5231,15 +5262,10 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) if (!data) return -ENXIO; - /* protect against inflight atomic switch, which drops the ref */ - percpu_ref_get(&data->refs); - /* wait for existing switches */ - flush_work(&data->ref_work); percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); - wait_for_completion(&data->done); - percpu_ref_put(&data->refs); - /* flush potential new switch */ flush_work(&data->ref_work); + wait_for_completion(&data->done); + io_ring_file_ref_flush(data); percpu_ref_exit(&data->refs); __io_sqe_files_unregister(ctx); @@ -5477,14 +5503,11 @@ struct io_file_put { struct completion *done; }; -static void io_ring_file_ref_switch(struct work_struct *work) +static void io_ring_file_ref_flush(struct fixed_file_data *data) { struct io_file_put *pfile, *tmp; - struct fixed_file_data *data; struct llist_node *node; - data = container_of(work, struct fixed_file_data, ref_work); - while ((node = llist_del_all(&data->put_llist)) != NULL) { llist_for_each_entry_safe(pfile, tmp, node, llist) { io_ring_file_put(data->ctx, pfile->file); @@ -5494,7 +5517,14 @@ static void io_ring_file_ref_switch(struct work_struct *work) kfree(pfile); } } +} +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct fixed_file_data *data; + + data = container_of(work, struct fixed_file_data, ref_work); + io_ring_file_ref_flush(data); percpu_ref_get(&data->refs); percpu_ref_switch_to_percpu(&data->refs); } @@ -5505,8 +5535,14 @@ static void io_file_data_ref_zero(struct percpu_ref *ref) data = container_of(ref, struct fixed_file_data, refs); - /* we can't safely switch from inside this context, punt to wq */ - queue_work(system_wq, &data->ref_work); + /* + * We can't safely switch from inside this context, punt to wq. If + * the table ref is going away, the table is being unregistered. + * Don't queue up the async work for that case, the caller will + * handle it. + */ + if (!percpu_ref_is_dying(&data->refs)) + queue_work(system_wq, &data->ref_work); } static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, @@ -6295,6 +6331,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + /* + * Wait for sq thread to idle, if we have one. It won't spin on new + * work after we've killed the ctx ref above. This is important to do + * before we cancel existing commands, as the thread could otherwise + * be queueing new work post that. If that's work we need to cancel, + * it could cause shutdown to hang. + */ + while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait)) + cpu_relax(); + io_kill_timeouts(ctx); io_poll_remove_all(ctx); @@ -6501,6 +6547,80 @@ out_fput: return submitted ? submitted : ret; } +static int io_uring_show_cred(int id, void *p, void *data) +{ + const struct cred *cred = p; + struct seq_file *m = data; + struct user_namespace *uns = seq_user_ns(m); + struct group_info *gi; + kernel_cap_t cap; + unsigned __capi; + int g; + + seq_printf(m, "%5d\n", id); + seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); + seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); + seq_puts(m, "\n\tGroups:\t"); + gi = cred->group_info; + for (g = 0; g < gi->ngroups; g++) { + seq_put_decimal_ull(m, g ? " " : "", + from_kgid_munged(uns, gi->gid[g])); + } + seq_puts(m, "\n\tCapEff:\t"); + cap = cred->cap_effective; + CAP_FOR_EACH_U32(__capi) + seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); + seq_putc(m, '\n'); + return 0; +} + +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) +{ + int i; + + mutex_lock(&ctx->uring_lock); + seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); + for (i = 0; i < ctx->nr_user_files; i++) { + struct fixed_file_table *table; + struct file *f; + + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; + f = table->files[i & IORING_FILE_TABLE_MASK]; + if (f) + seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); + else + seq_printf(m, "%5u: <none>\n", i); + } + seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *buf = &ctx->user_bufs[i]; + + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, + (unsigned int) buf->len); + } + if (!idr_is_empty(&ctx->personality_idr)) { + seq_printf(m, "Personalities:\n"); + idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); + } + mutex_unlock(&ctx->uring_lock); +} + +static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct io_ring_ctx *ctx = f->private_data; + + if (percpu_ref_tryget(&ctx->refs)) { + __io_uring_show_fdinfo(ctx, m); + percpu_ref_put(&ctx->refs); + } +} + static const struct file_operations io_uring_fops = { .release = io_uring_release, .flush = io_uring_flush, @@ -6511,6 +6631,7 @@ static const struct file_operations io_uring_fops = { #endif .poll = io_uring_poll, .fasync = io_uring_fasync, + .show_fdinfo = io_uring_show_fdinfo, }; static int io_allocate_scq_urings(struct io_ring_ctx *ctx, @@ -6963,6 +7084,39 @@ out_fput: static int __init io_uring_init(void) { +#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ + BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ + BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ +} while (0) + +#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ + __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) + BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); + BUILD_BUG_SQE_ELEM(0, __u8, opcode); + BUILD_BUG_SQE_ELEM(1, __u8, flags); + BUILD_BUG_SQE_ELEM(2, __u16, ioprio); + BUILD_BUG_SQE_ELEM(4, __s32, fd); + BUILD_BUG_SQE_ELEM(8, __u64, off); + BUILD_BUG_SQE_ELEM(8, __u64, addr2); + BUILD_BUG_SQE_ELEM(16, __u64, addr); + BUILD_BUG_SQE_ELEM(24, __u32, len); + BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); + BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); + BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); + BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); + BUILD_BUG_SQE_ELEM(28, __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); + BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); + BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); + BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); + BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); + BUILD_BUG_SQE_ELEM(28, __u32, open_flags); + BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); + BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); + BUILD_BUG_SQE_ELEM(32, __u64, user_data); + BUILD_BUG_SQE_ELEM(40, __u16, buf_index); + BUILD_BUG_SQE_ELEM(42, __u16, personality); + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 60bf8ff78913..eb8ca446d1ab 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1074,12 +1074,11 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static const struct file_operations jbd2_seq_info_fops = { - .owner = THIS_MODULE, - .open = jbd2_seq_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = jbd2_seq_info_release, +static const struct proc_ops jbd2_info_proc_ops = { + .proc_open = jbd2_seq_info_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = jbd2_seq_info_release, }; static struct proc_dir_entry *proc_jbd2_stats; @@ -1089,7 +1088,7 @@ static void jbd2_stats_proc_init(journal_t *journal) journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); if (journal->j_proc_entry) { proc_create_data("info", S_IRUGO, journal->j_proc_entry, - &jbd2_seq_info_fops, journal); + &jbd2_info_proc_ops, journal); } } diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 888cdd685a1e..44b62b3c322e 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -43,12 +43,12 @@ static ssize_t jfs_loglevel_proc_write(struct file *file, return count; } -static const struct file_operations jfs_loglevel_proc_fops = { - .open = jfs_loglevel_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = jfs_loglevel_proc_write, +static const struct proc_ops jfs_loglevel_proc_ops = { + .proc_open = jfs_loglevel_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = jfs_loglevel_proc_write, }; #endif @@ -68,7 +68,7 @@ void jfs_proc_init(void) #endif #ifdef CONFIG_JFS_DEBUG proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show); - proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops); + proc_create("loglevel", 0, base, &jfs_loglevel_proc_ops); #endif } diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index caade185e568..7dfcab2a2da6 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -4027,7 +4027,6 @@ static int dbGetL2AGSize(s64 nblocks) */ #define MAXL0PAGES (1 + LPERCTL) #define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) -#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) /* * convert number of map pages to the zero origin top dmapctl level diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index eac277c63d42..d0f7a5abd9a9 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -160,9 +160,9 @@ static inline void set_inode_attr(struct inode *inode, { inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; - inode->i_atime = timestamp_truncate(attrs->ia_atime, inode); - inode->i_mtime = timestamp_truncate(attrs->ia_mtime, inode); - inode->i_ctime = timestamp_truncate(attrs->ia_ctime, inode); + inode->i_atime = attrs->ia_atime; + inode->i_mtime = attrs->ia_mtime; + inode->i_ctime = attrs->ia_ctime; } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) diff --git a/fs/libfs.c b/fs/libfs.c index 1463b038ffc4..c686bd9caac6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -19,6 +19,7 @@ #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> +#include <linux/fsnotify.h> #include <linux/uaccess.h> @@ -239,6 +240,75 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); +static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) +{ + struct dentry *child = NULL; + struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs; + + spin_lock(&parent->d_lock); + while ((p = p->next) != &parent->d_subdirs) { + struct dentry *d = container_of(p, struct dentry, d_child); + if (simple_positive(d)) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + child = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(child)) + break; + } + } + spin_unlock(&parent->d_lock); + dput(prev); + return child; +} + +void simple_recursive_removal(struct dentry *dentry, + void (*callback)(struct dentry *)) +{ + struct dentry *this = dget(dentry); + while (true) { + struct dentry *victim = NULL, *child; + struct inode *inode = this->d_inode; + + inode_lock(inode); + if (d_is_dir(this)) + inode->i_flags |= S_DEAD; + while ((child = find_next_child(this, victim)) == NULL) { + // kill and ascend + // update metadata while it's still locked + inode->i_ctime = current_time(inode); + clear_nlink(inode); + inode_unlock(inode); + victim = this; + this = this->d_parent; + inode = this->d_inode; + inode_lock(inode); + if (simple_positive(victim)) { + d_invalidate(victim); // avoid lost mounts + if (d_is_dir(victim)) + fsnotify_rmdir(inode, victim); + else + fsnotify_unlink(inode, victim); + if (callback) + callback(victim); + dput(victim); // unpin it + } + if (victim == dentry) { + inode->i_ctime = inode->i_mtime = + current_time(inode); + if (d_is_dir(dentry)) + drop_nlink(inode); + inode_unlock(inode); + dput(dentry); + return; + } + } + inode_unlock(inode); + this = child; + } +} +EXPORT_SYMBOL(simple_recursive_removal); + static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c index ca9228a56d65..a01f08c8c2f3 100644 --- a/fs/lockd/procfs.c +++ b/fs/lockd/procfs.c @@ -60,11 +60,11 @@ nlm_end_grace_read(struct file *file, char __user *buf, size_t size, return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp)); } -static const struct file_operations lockd_end_grace_operations = { - .write = nlm_end_grace_write, - .read = nlm_end_grace_read, - .llseek = default_llseek, - .release = simple_transaction_release, +static const struct proc_ops lockd_end_grace_proc_ops = { + .proc_write = nlm_end_grace_write, + .proc_read = nlm_end_grace_read, + .proc_lseek = default_llseek, + .proc_release = simple_transaction_release, }; int __init @@ -76,7 +76,7 @@ lockd_create_procfs(void) if (!entry) return -ENOMEM; entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry, - &lockd_end_grace_operations); + &lockd_end_grace_proc_ops); if (!entry) { remove_proc_entry("fs/lockd", NULL); return -ENOMEM; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 11b42c523f04..7eb919f1b13f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -157,11 +157,11 @@ static int exports_proc_open(struct inode *inode, struct file *file) return exports_net_open(current->nsproxy->net_ns, file); } -static const struct file_operations exports_proc_operations = { - .open = exports_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops exports_proc_ops = { + .proc_open = exports_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; static int exports_nfsd_open(struct inode *inode, struct file *file) @@ -1431,8 +1431,7 @@ static int create_proc_exports_entry(void) entry = proc_mkdir("fs/nfs", NULL); if (!entry) return -ENOMEM; - entry = proc_create("exports", 0, entry, - &exports_proc_operations); + entry = proc_create("exports", 0, entry, &exports_proc_ops); if (!entry) { remove_proc_entry("fs/nfs", NULL); return -ENOMEM; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 9bce3b913189..b1bc582b0493 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -84,17 +84,17 @@ static int nfsd_proc_open(struct inode *inode, struct file *file) return single_open(file, nfsd_proc_show, NULL); } -static const struct file_operations nfsd_proc_fops = { - .open = nfsd_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops nfsd_proc_ops = { + .proc_open = nfsd_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; void nfsd_stat_init(void) { - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops); + svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); } void diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 6c7388430ad3..d4359a1df3d5 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2899,18 +2899,12 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) ia_valid |= ATTR_MTIME | ATTR_CTIME; } } - if (ia_valid & ATTR_ATIME) { - vi->i_atime = timestamp_truncate(attr->ia_atime, - vi); - } - if (ia_valid & ATTR_MTIME) { - vi->i_mtime = timestamp_truncate(attr->ia_mtime, - vi); - } - if (ia_valid & ATTR_CTIME) { - vi->i_ctime = timestamp_truncate(attr->ia_ctime, - vi); - } + if (ia_valid & ATTR_ATIME) + vi->i_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + vi->i_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + vi->i_ctime = attr->ia_ctime; mark_inode_dirty(vi); out: return err; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9876db52913a..6cd5e4924e4d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2101,17 +2101,15 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, struct buffer_head **di_bh, int meta_level, - int overwrite_io, int write_sem, int wait) { int ret = 0; if (wait) - ret = ocfs2_inode_lock(inode, NULL, meta_level); + ret = ocfs2_inode_lock(inode, di_bh, meta_level); else - ret = ocfs2_try_inode_lock(inode, - overwrite_io ? NULL : di_bh, meta_level); + ret = ocfs2_try_inode_lock(inode, di_bh, meta_level); if (ret < 0) goto out; @@ -2136,6 +2134,7 @@ static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, out_unlock: brelse(*di_bh); + *di_bh = NULL; ocfs2_inode_unlock(inode, meta_level); out: return ret; @@ -2177,7 +2176,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, ret = ocfs2_inode_lock_for_extent_tree(inode, &di_bh, meta_level, - overwrite_io, write_sem, wait); if (ret < 0) { @@ -2233,13 +2231,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file, &di_bh, meta_level, write_sem); + meta_level = 1; + write_sem = 1; ret = ocfs2_inode_lock_for_extent_tree(inode, &di_bh, meta_level, - overwrite_io, - 1, + write_sem, wait); - write_sem = 1; if (ret < 0) { if (ret != -EAGAIN) mlog_errno(ret); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4180c3ef0a68..939df99d2dec 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -696,7 +696,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, ac, cl); - if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + if (PTR_ERR(bg_bh) == -ENOSPC) bg_bh = ocfs2_block_group_alloc_discontig(handle, alloc_inode, ac, cl); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 6220642fe113..9fc47c2e078d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -24,7 +24,7 @@ static int ovl_ccup_set(const char *buf, const struct kernel_param *param) { - pr_warn("overlayfs: \"check_copy_up\" module option is obsolete\n"); + pr_warn("\"check_copy_up\" module option is obsolete\n"); return 0; } @@ -123,6 +123,9 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) loff_t old_pos = 0; loff_t new_pos = 0; loff_t cloned; + loff_t data_pos = -1; + loff_t hole_len; + bool skip_hole = false; int error = 0; if (len == 0) @@ -144,7 +147,11 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) goto out; /* Couldn't clone, so now we try to copy the data */ - /* FIXME: copy up sparse files efficiently */ + /* Check if lower fs supports seek operation */ + if (old_file->f_mode & FMODE_LSEEK && + old_file->f_op->llseek) + skip_hole = true; + while (len) { size_t this_len = OVL_COPY_UP_CHUNK_SIZE; long bytes; @@ -157,6 +164,36 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) break; } + /* + * Fill zero for hole will cost unnecessary disk space + * and meanwhile slow down the copy-up speed, so we do + * an optimization for hole during copy-up, it relies + * on SEEK_DATA implementation in lower fs so if lower + * fs does not support it, copy-up will behave as before. + * + * Detail logic of hole detection as below: + * When we detect next data position is larger than current + * position we will skip that hole, otherwise we copy + * data in the size of OVL_COPY_UP_CHUNK_SIZE. Actually, + * it may not recognize all kind of holes and sometimes + * only skips partial of hole area. However, it will be + * enough for most of the use cases. + */ + + if (skip_hole && data_pos < old_pos) { + data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA); + if (data_pos > old_pos) { + hole_len = data_pos - old_pos; + len -= hole_len; + old_pos = new_pos = data_pos; + continue; + } else if (data_pos == -ENXIO) { + break; + } else if (data_pos < 0) { + skip_hole = false; + } + } + bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, this_len, SPLICE_F_MOVE); @@ -480,7 +517,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) } inode_lock(temp->d_inode); - if (c->metacopy) + if (S_ISREG(c->stat.mode)) err = ovl_set_size(temp, &c->stat); if (!err) err = ovl_set_attr(temp, &c->stat); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 29abdb1d3b5c..8e57d5372b8f 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -35,7 +35,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) dput(wdentry); if (err) { - pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", + pr_err("cleanup of '%pd2' failed (%i)\n", wdentry, err); } @@ -53,7 +53,7 @@ static struct dentry *ovl_lookup_temp(struct dentry *workdir) temp = lookup_one_len(name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { - pr_err("overlayfs: workdir/%s already exists\n", name); + pr_err("workdir/%s already exists\n", name); dput(temp); temp = ERR_PTR(-EIO); } @@ -134,7 +134,7 @@ static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, d = lookup_one_len(dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { - pr_warn("overlayfs: failed lookup after mkdir (%pd2, err=%i).\n", + pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", dentry, err); return PTR_ERR(d); } @@ -267,7 +267,7 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); if (inode != oip.newinode) { - pr_warn_ratelimited("overlayfs: newly created inode found in cache (%pd2)\n", + pr_warn_ratelimited("newly created inode found in cache (%pd2)\n", dentry); } @@ -1009,7 +1009,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) spin_unlock(&dentry->d_lock); } else { kfree(redirect); - pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n", + pr_warn_ratelimited("failed to set redirect (%i)\n", err); /* Fall back to userspace copy-up */ err = -EXDEV; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 70e55588aedc..6f54d70cef27 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -30,7 +30,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry) } if (err) { - pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n", + pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n", dentry, err); } @@ -244,7 +244,7 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", + pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", dentry, err, buflen, fh ? (int)fh->fb.len : 0, fh ? fh->fb.type : 0); goto out; @@ -358,7 +358,7 @@ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) */ static struct dentry *ovl_lookup_real_one(struct dentry *connected, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct inode *dir = d_inode(connected); struct dentry *this, *parent = NULL; @@ -406,7 +406,7 @@ out: return this; fail: - pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + pr_warn_ratelimited("failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); this = ERR_PTR(err); goto out; @@ -414,17 +414,16 @@ fail: static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer); + const struct ovl_layer *layer); /* * Lookup an indexed or hashed overlay dentry by real inode. */ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct ovl_fs *ofs = sb->s_fs_info; - struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; struct dentry *index = NULL; struct dentry *this = NULL; struct inode *inode; @@ -466,7 +465,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, * recursive call walks back from indexed upper to the topmost * connected/hashed upper parent (or up to root). */ - this = ovl_lookup_real(sb, upper, &upper_layer); + this = ovl_lookup_real(sb, upper, &ofs->layers[0]); dput(upper); } @@ -487,7 +486,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, */ static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct dentry *next, *parent = NULL; struct dentry *ancestor = ERR_PTR(-EIO); @@ -540,7 +539,7 @@ static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, */ static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct dentry *connected; int err = 0; @@ -631,7 +630,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb, return connected; fail: - pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + pr_warn_ratelimited("failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); dput(connected); return ERR_PTR(err); @@ -646,8 +645,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb, struct dentry *index) { struct ovl_fs *ofs = sb->s_fs_info; - struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; - struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer; + const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer; struct dentry *real = upper ?: (index ?: lowerpath->dentry); /* @@ -822,7 +820,7 @@ out: return dentry; out_err: - pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", + pr_warn_ratelimited("failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", fh_len, fh_type, flags, err); dentry = ERR_PTR(err); goto out; @@ -831,7 +829,7 @@ out_err: static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); + pr_warn_ratelimited("connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); return ERR_PTR(-EACCES); } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index e235a635d9ec..a5317216de73 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -9,8 +9,19 @@ #include <linux/xattr.h> #include <linux/uio.h> #include <linux/uaccess.h> +#include <linux/splice.h> +#include <linux/mm.h> +#include <linux/fs.h> #include "overlayfs.h" +struct ovl_aio_req { + struct kiocb iocb; + struct kiocb *orig_iocb; + struct fd fd; +}; + +static struct kmem_cache *ovl_aio_request_cachep; + static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -146,7 +157,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); struct fd real; const struct cred *old_cred; - ssize_t ret; + loff_t ret; /* * The two special cases below do not need to involve real fs, @@ -171,7 +182,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) * limitations that are more strict than ->s_maxbytes for specific * files, so we use the real file to perform seeks. */ - inode_lock(inode); + ovl_inode_lock(inode); real.file->f_pos = file->f_pos; old_cred = ovl_override_creds(inode->i_sb); @@ -179,7 +190,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) revert_creds(old_cred); file->f_pos = real.file->f_pos; - inode_unlock(inode); + ovl_inode_unlock(inode); fdput(real); @@ -225,6 +236,33 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) return flags; } +static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) +{ + struct kiocb *iocb = &aio_req->iocb; + struct kiocb *orig_iocb = aio_req->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(orig_iocb->ki_filp); + + file_end_write(iocb->ki_filp); + ovl_copyattr(ovl_inode_real(inode), inode); + } + + orig_iocb->ki_pos = iocb->ki_pos; + fdput(aio_req->fd); + kmem_cache_free(ovl_aio_request_cachep, aio_req); +} + +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) +{ + struct ovl_aio_req *aio_req = container_of(iocb, + struct ovl_aio_req, iocb); + struct kiocb *orig_iocb = aio_req->orig_iocb; + + ovl_aio_cleanup_handler(aio_req); + orig_iocb->ki_complete(orig_iocb, res, res2); +} + static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -240,10 +278,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); + if (is_sync_kiocb(iocb)) { + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + } else { + struct ovl_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + aio_req->fd = real; + real.flags = 0; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, real.file); + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); + if (ret != -EIOCBQUEUED) + ovl_aio_cleanup_handler(aio_req); + } +out: revert_creds(old_cred); - ovl_file_accessed(file); fdput(real); @@ -274,15 +330,33 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); - file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); - file_end_write(real.file); + if (is_sync_kiocb(iocb)) { + file_start_write(real.file); + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + file_end_write(real.file); + /* Update size */ + ovl_copyattr(ovl_inode_real(inode), inode); + } else { + struct ovl_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + file_start_write(real.file); + aio_req->fd = real; + real.flags = 0; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, real.file); + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); + if (ret != -EIOCBQUEUED) + ovl_aio_cleanup_handler(aio_req); + } +out: revert_creds(old_cred); - - /* Update size */ - ovl_copyattr(ovl_inode_real(inode), inode); - fdput(real); out_unlock: @@ -291,6 +365,48 @@ out_unlock: return ret; } +static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t ret; + struct fd real; + const struct cred *old_cred; + + ret = ovl_real_fdget(in, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(in)->i_sb); + ret = generic_file_splice_read(real.file, ppos, pipe, len, flags); + revert_creds(old_cred); + + ovl_file_accessed(in); + fdput(real); + return ret; +} + +static ssize_t +ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fd real; + const struct cred *old_cred; + ssize_t ret; + + ret = ovl_real_fdget(out, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(out)->i_sb); + ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); + revert_creds(old_cred); + + ovl_file_accessed(out); + fdput(real); + return ret; +} + static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fd real; @@ -647,7 +763,25 @@ const struct file_operations ovl_file_operations = { .fadvise = ovl_fadvise, .unlocked_ioctl = ovl_ioctl, .compat_ioctl = ovl_compat_ioctl, + .splice_read = ovl_splice_read, + .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; + +int __init ovl_aio_request_cache_init(void) +{ + ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", + sizeof(struct ovl_aio_req), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ovl_aio_request_cachep) + return -ENOMEM; + + return 0; +} + +void ovl_aio_request_cache_destroy(void) +{ + kmem_cache_destroy(ovl_aio_request_cachep); +} diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b045cf1826fc..79e8994e3bc1 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -75,10 +75,9 @@ out: return err; } -static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, - struct ovl_layer *lower_layer) +static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) { - bool samefs = ovl_same_sb(dentry->d_sb); + bool samefs = ovl_same_fs(dentry->d_sb); unsigned int xinobits = ovl_xino_bits(dentry->d_sb); if (samefs) { @@ -100,12 +99,10 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, * persistent for a given layer configuration. */ if (stat->ino >> shift) { - pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", dentry, stat->ino, xinobits); } else { - if (lower_layer) - stat->ino |= ((u64)lower_layer->fsid) << shift; - + stat->ino |= ((u64)fsid) << shift; stat->dev = dentry->d_sb->s_dev; return 0; } @@ -124,15 +121,14 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, */ stat->dev = dentry->d_sb->s_dev; stat->ino = dentry->d_inode->i_ino; - } else if (lower_layer && lower_layer->fsid) { + } else { /* * For non-samefs setup, if we cannot map all layers st_ino * to a unified address space, we need to make sure that st_dev - * is unique per lower fs. Upper layer uses real st_dev and - * lower layers use the unique anonymous bdev assigned to the - * lower fs. + * is unique per underlying fs, so we use the unique anonymous + * bdev assigned to the underlying fs. */ - stat->dev = lower_layer->fs->pseudo_dev; + stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev; } return 0; @@ -146,8 +142,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, struct path realpath; const struct cred *old_cred; bool is_dir = S_ISDIR(dentry->d_inode->i_mode); - bool samefs = ovl_same_sb(dentry->d_sb); - struct ovl_layer *lower_layer = NULL; + int fsid = 0; int err; bool metacopy_blocks = false; @@ -168,9 +163,9 @@ int ovl_getattr(const struct path *path, struct kstat *stat, * If lower filesystem supports NFS file handles, this also guaranties * persistent st_ino across mount cycle. */ - if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) { + if (!is_dir || ovl_same_dev(dentry->d_sb)) { if (!OVL_TYPE_UPPER(type)) { - lower_layer = ovl_layer_lower(dentry); + fsid = ovl_layer_lower(dentry)->fsid; } else if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | STATX_BLOCKS | @@ -200,14 +195,8 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && (is_dir || lowerstat.nlink == 1))) { - lower_layer = ovl_layer_lower(dentry); - /* - * Cannot use origin st_dev;st_ino because - * origin inode content may differ from overlay - * inode content. - */ - if (samefs || lower_layer->fsid) - stat->ino = lowerstat.ino; + fsid = ovl_layer_lower(dentry)->fsid; + stat->ino = lowerstat.ino; } /* @@ -241,7 +230,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, } } - err = ovl_map_dev_ino(dentry, stat, lower_layer); + err = ovl_map_dev_ino(dentry, stat, fsid); if (err) goto out; @@ -527,6 +516,27 @@ static const struct address_space_operations ovl_aops = { * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) * [...] &type->i_mutex_dir_key (stack_depth=0) + * + * Locking order w.r.t ovl_want_write() is important for nested overlayfs. + * + * This chain is valid: + * - inode->i_rwsem (inode_lock[2]) + * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) + * - OVL_I(inode)->lock (ovl_inode_lock[2]) + * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) + * + * And this chain is valid: + * - inode->i_rwsem (inode_lock[2]) + * - OVL_I(inode)->lock (ovl_inode_lock[2]) + * - lowerinode->i_rwsem (inode_lock[1]) + * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) + * + * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is + * held, because it is in reverse order of the non-nested case using the same + * upper fs: + * - inode->i_rwsem (inode_lock[1]) + * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) + * - OVL_I(inode)->lock (ovl_inode_lock[1]) */ #define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH @@ -565,7 +575,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). */ - if (ovl_same_sb(inode->i_sb) || xinobits) { + if (ovl_same_dev(inode->i_sb)) { inode->i_ino = ino; if (xinobits && fsid && !(ino >> (64 - xinobits))) inode->i_ino |= (unsigned long)fsid << (64 - xinobits); @@ -698,7 +708,7 @@ unsigned int ovl_get_nlink(struct dentry *lowerdentry, return nlink; fail: - pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n", + pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n", upperdentry, err); return fallback; } @@ -969,7 +979,7 @@ out: return inode; out_err: - pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err); + pr_warn_ratelimited("failed to get inode (%i)\n", err); inode = ERR_PTR(err); goto out; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 76ff66339173..ed9e129fae04 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -141,10 +141,10 @@ out: return NULL; fail: - pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res); + pr_warn_ratelimited("failed to get origin (%i)\n", res); goto out; invalid: - pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh); + pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh); goto out; } @@ -322,16 +322,16 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *origin = NULL; int i; - for (i = 0; i < ofs->numlower; i++) { + for (i = 1; i < ofs->numlayer; i++) { /* * If lower fs uuid is not unique among lower fs we cannot match * fh->uuid to layer. */ - if (ofs->lower_layers[i].fsid && - ofs->lower_layers[i].fs->bad_uuid) + if (ofs->layers[i].fsid && + ofs->layers[i].fs->bad_uuid) continue; - origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, + origin = ovl_decode_real_fh(fh, ofs->layers[i].mnt, connected); if (origin) break; @@ -354,13 +354,13 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, } **stackp = (struct ovl_path){ .dentry = origin, - .layer = &ofs->lower_layers[i] + .layer = &ofs->layers[i] }; return 0; invalid: - pr_warn_ratelimited("overlayfs: invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", + pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); dput(origin); @@ -449,7 +449,7 @@ out: fail: inode = d_inode(real); - pr_warn_ratelimited("overlayfs: failed to verify %s (%pd2, ino=%lu, err=%i)\n", + pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n", is_upper ? "upper" : "origin", real, inode ? inode->i_ino : 0, err); goto out; @@ -475,7 +475,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) return upper ?: ERR_PTR(-ESTALE); if (!d_is_dir(upper)) { - pr_warn_ratelimited("overlayfs: invalid index upper (%pd2, upper=%pd2).\n", + pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n", index, upper); dput(upper); return ERR_PTR(-EIO); @@ -589,12 +589,12 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, ftype=%x, err=%i)\n", + pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n", index, d_inode(index)->i_mode & S_IFMT, err); goto out; orphan: - pr_warn_ratelimited("overlayfs: orphan index entry (%pd2, ftype=%x, nlink=%u)\n", + pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(index)->i_nlink); err = -ENOENT; @@ -696,7 +696,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = NULL; goto out; } - pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" + pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); @@ -723,13 +723,13 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, * unlinked, which means that finding a lower origin on lookup * whose index is a whiteout should be treated as an error. */ - pr_warn_ratelimited("overlayfs: bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", + pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); goto fail; } else if (is_dir && verify) { if (!upper) { - pr_warn_ratelimited("overlayfs: suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", + pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", origin, index); goto fail; } @@ -738,7 +738,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, err = ovl_verify_upper(index, upper, false); if (err) { if (err == -ESTALE) { - pr_warn_ratelimited("overlayfs: suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", + pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", upper, origin, index); } goto fail; @@ -885,7 +885,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!d.stop && poe->numlower) { err = -ENOMEM; - stack = kcalloc(ofs->numlower, sizeof(struct ovl_path), + stack = kcalloc(ofs->numlayer - 1, sizeof(struct ovl_path), GFP_KERNEL); if (!stack) goto out_put_upper; @@ -967,7 +967,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, */ err = -EPERM; if (d.redirect && !ofs->config.redirect_follow) { - pr_warn_ratelimited("overlayfs: refusing to follow redirect for (%pd2)\n", + pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", dentry); goto out_put; } @@ -994,7 +994,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = -EPERM; if (!ofs->config.metacopy) { - pr_warn_ratelimited("overlay: refusing to follow metacopy origin for (%pd2)\n", + pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); goto out_put; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index f283b1d69a9e..3623d28aa4fa 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -9,6 +9,9 @@ #include <linux/fs.h> #include "ovl_entry.h" +#undef pr_fmt +#define pr_fmt(fmt) "overlayfs: " fmt + enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), __OVL_PATH_MERGE = (1 << 1), @@ -221,7 +224,6 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); -struct super_block *ovl_same_sb(struct super_block *sb); int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); @@ -237,7 +239,7 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); -struct ovl_layer *ovl_layer_lower(struct dentry *dentry); +const struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); @@ -299,11 +301,21 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); } -static inline unsigned int ovl_xino_bits(struct super_block *sb) +/* All layers on same fs? */ +static inline bool ovl_same_fs(struct super_block *sb) +{ + return OVL_FS(sb)->xino_mode == 0; +} + +/* All overlay inodes have same st_dev? */ +static inline bool ovl_same_dev(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + return OVL_FS(sb)->xino_mode >= 0; +} - return ofs->xino_bits; +static inline unsigned int ovl_xino_bits(struct super_block *sb) +{ + return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0; } static inline int ovl_inode_lock(struct inode *inode) @@ -438,6 +450,8 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); /* file.c */ extern const struct file_operations ovl_file_operations; +int __init ovl_aio_request_cache_init(void); +void ovl_aio_request_cache_destroy(void); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 28348c44ea5b..89015ea822e7 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -24,6 +24,8 @@ struct ovl_sb { dev_t pseudo_dev; /* Unusable (conflicting) uuid */ bool bad_uuid; + /* Used as a lower layer (but maybe also as upper) */ + bool is_lower; }; struct ovl_layer { @@ -38,18 +40,18 @@ struct ovl_layer { }; struct ovl_path { - struct ovl_layer *layer; + const struct ovl_layer *layer; struct dentry *dentry; }; /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; - unsigned int numlower; - /* Number of unique lower sb that differ from upper sb */ - unsigned int numlowerfs; - struct ovl_layer *lower_layers; - struct ovl_sb *lower_fs; + unsigned int numlayer; + /* Number of unique fs among layers including upper fs */ + unsigned int numfs; + const struct ovl_layer *layers; + struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' directory under workbasedir */ @@ -71,10 +73,15 @@ struct ovl_fs { struct inode *workbasedir_trap; struct inode *workdir_trap; struct inode *indexdir_trap; - /* Inode numbers in all layers do not use the high xino_bits */ - unsigned int xino_bits; + /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */ + int xino_mode; }; +static inline struct ovl_fs *OVL_FS(struct super_block *sb) +{ + return (struct ovl_fs *)sb->s_fs_info; +} + /* private information held for every overlayfs dentry */ struct ovl_entry { union { diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 47a91c9733a5..40ac9ce2465a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -441,7 +441,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, const char *name, int namelen) { if (ino >> (64 - xinobits)) { - pr_warn_ratelimited("overlayfs: d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", namelen, name, ino, xinobits); return ino; } @@ -469,7 +469,7 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) int xinobits = ovl_xino_bits(dir->d_sb); int err = 0; - if (!ovl_same_sb(dir->d_sb) && !xinobits) + if (!ovl_same_dev(dir->d_sb)) goto out; if (p->name[0] == '.') { @@ -504,7 +504,13 @@ get: if (err) goto fail; - WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + /* + * Directory inode is always on overlay st_dev. + * Non-dir with ovl_same_dev() could be on pseudo st_dev in case + * of xino bits overflow. + */ + WARN_ON_ONCE(S_ISDIR(stat.mode) && + dir->d_sb->s_dev != stat.dev); ino = stat.ino; } else if (xinobits && !OVL_TYPE_UPPER(type)) { ino = ovl_remap_lower_ino(ino, xinobits, @@ -518,7 +524,7 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n", + pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } @@ -685,7 +691,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; - struct ovl_layer *lower_layer = ovl_layer_lower(dir); + const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .orig_ctx = ctx, @@ -738,7 +744,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) * entries. */ if (ovl_xino_bits(dentry->d_sb) || - (ovl_same_sb(dentry->d_sb) && + (ovl_same_fs(dentry->d_sb) && (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { return ovl_iterate_real(file, ctx); @@ -965,7 +971,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) dentry = lookup_one_len(p->name, upper, p->len); if (IS_ERR(dentry)) { - pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n", + pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, (int) PTR_ERR(dentry)); continue; @@ -1147,6 +1153,6 @@ next: out: ovl_cache_free(&list); if (err) - pr_err("overlayfs: failed index dir cleanup (%i)\n", err); + pr_err("failed index dir cleanup (%i)\n", err); return err; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7621ff176d15..319fe0d355b0 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -224,14 +224,14 @@ static void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ofs->upper_mnt->mnt_root); mntput(ofs->upper_mnt); - for (i = 0; i < ofs->numlower; i++) { - iput(ofs->lower_layers[i].trap); - mntput(ofs->lower_layers[i].mnt); + for (i = 1; i < ofs->numlayer; i++) { + iput(ofs->layers[i].trap); + mntput(ofs->layers[i].mnt); } - for (i = 0; i < ofs->numlowerfs; i++) - free_anon_bdev(ofs->lower_fs[i].pseudo_dev); - kfree(ofs->lower_layers); - kfree(ofs->lower_fs); + kfree(ofs->layers); + for (i = 0; i < ofs->numfs; i++) + free_anon_bdev(ofs->fs[i].pseudo_dev); + kfree(ofs->fs); kfree(ofs->config.lowerdir); kfree(ofs->config.upperdir); @@ -358,7 +358,7 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); - if (ofs->config.xino != ovl_xino_def()) + if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(sb)) seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); if (ofs->config.metacopy != ovl_metacopy_def) seq_printf(m, ",metacopy=%s", @@ -462,7 +462,7 @@ static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) if (ovl_redirect_always_follow) config->redirect_follow = true; } else if (strcmp(mode, "nofollow") != 0) { - pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n", + pr_err("bad mount option \"redirect_dir=%s\"\n", mode); return -EINVAL; } @@ -560,14 +560,15 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) break; default: - pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); + pr_err("unrecognized mount option \"%s\" or missing value\n", + p); return -EINVAL; } } /* Workdir is useless in non-upper mount */ if (!config->upperdir && config->workdir) { - pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n", config->workdir); kfree(config->workdir); config->workdir = NULL; @@ -587,7 +588,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) /* Resolve metacopy -> redirect_dir dependency */ if (config->metacopy && !config->redirect_dir) { if (metacopy_opt && redirect_opt) { - pr_err("overlayfs: conflicting options: metacopy=on,redirect_dir=%s\n", + pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", config->redirect_mode); return -EINVAL; } @@ -596,7 +597,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) * There was an explicit redirect_dir=... that resulted * in this conflict. */ - pr_info("overlayfs: disabling metacopy due to redirect_dir=%s\n", + pr_info("disabling metacopy due to redirect_dir=%s\n", config->redirect_mode); config->metacopy = false; } else { @@ -692,7 +693,7 @@ out_unlock: out_dput: dput(work); out_err: - pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", + pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n", ofs->config.workdir, name, -err); work = NULL; goto out_unlock; @@ -716,21 +717,21 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) int err = -EINVAL; if (!*name) { - pr_err("overlayfs: empty lowerdir\n"); + pr_err("empty lowerdir\n"); goto out; } err = kern_path(name, LOOKUP_FOLLOW, path); if (err) { - pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + pr_err("failed to resolve '%s': %i\n", name, err); goto out; } err = -EINVAL; if (ovl_dentry_weird(path->dentry)) { - pr_err("overlayfs: filesystem on '%s' not supported\n", name); + pr_err("filesystem on '%s' not supported\n", name); goto out_put; } if (!d_is_dir(path->dentry)) { - pr_err("overlayfs: '%s' not a directory\n", name); + pr_err("'%s' not a directory\n", name); goto out_put; } return 0; @@ -752,7 +753,7 @@ static int ovl_mount_dir(const char *name, struct path *path) if (!err) if (ovl_dentry_remote(path->dentry)) { - pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", + pr_err("filesystem on '%s' not supported as upperdir\n", tmp); path_put_init(path); err = -EINVAL; @@ -769,7 +770,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, int err = vfs_statfs(path, &statfs); if (err) - pr_err("overlayfs: statfs failed on '%s'\n", name); + pr_err("statfs failed on '%s'\n", name); else ofs->namelen = max(ofs->namelen, statfs.f_namelen); @@ -804,13 +805,13 @@ static int ovl_lower_dir(const char *name, struct path *path, (ofs->config.index && ofs->config.upperdir)) && !fh_type) { ofs->config.index = false; ofs->config.nfs_export = false; - pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", + pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } /* Check if lower fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) - ofs->xino_bits = 0; + ofs->xino_mode = -1; return 0; @@ -996,7 +997,7 @@ static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, err = PTR_ERR_OR_ZERO(trap); if (err) { if (err == -ELOOP) - pr_err("overlayfs: conflicting %s path\n", name); + pr_err("conflicting %s path\n", name); return err; } @@ -1013,11 +1014,11 @@ static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, static int ovl_report_in_use(struct ovl_fs *ofs, const char *name) { if (ofs->config.index) { - pr_err("overlayfs: %s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", + pr_err("%s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", name); return -EBUSY; } else { - pr_warn("overlayfs: %s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", + pr_warn("%s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", name); return 0; } @@ -1035,7 +1036,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, /* Upper fs should not be r/o */ if (sb_rdonly(upperpath->mnt->mnt_sb)) { - pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + pr_err("upper fs is r/o, try multi-lower layers mount\n"); err = -EINVAL; goto out; } @@ -1052,7 +1053,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, upper_mnt = clone_private_mount(upperpath); err = PTR_ERR(upper_mnt); if (IS_ERR(upper_mnt)) { - pr_err("overlayfs: failed to clone upperpath\n"); + pr_err("failed to clone upperpath\n"); goto out; } @@ -1108,7 +1109,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, * kernel upgrade. So warn instead of erroring out. */ if (!err) - pr_warn("overlayfs: upper fs needs to support d_type.\n"); + pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0); @@ -1116,7 +1117,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, if (ofs->tmpfile) dput(temp); else - pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + pr_warn("upper fs does not support tmpfile.\n"); /* * Check if upper/work fs supports trusted.overlay.* xattr @@ -1126,7 +1127,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, ofs->noxattr = true; ofs->config.index = false; ofs->config.metacopy = false; - pr_warn("overlayfs: upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); + pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); err = 0; } else { vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); @@ -1136,16 +1137,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); if (ofs->config.index && !fh_type) { ofs->config.index = false; - pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); + pr_warn("upper fs does not support file handles, falling back to index=off.\n"); } /* Check if upper fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) - ofs->xino_bits = 0; + ofs->xino_mode = -1; /* NFS export of r/w mount depends on index */ if (ofs->config.nfs_export && !ofs->config.index) { - pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n"); + pr_warn("NFS export requires \"index=on\", falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } out: @@ -1165,11 +1166,11 @@ static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, err = -EINVAL; if (upperpath->mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + pr_err("workdir and upperdir must reside under the same mount\n"); goto out; } if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + pr_err("workdir and upperdir must be separate subtrees\n"); goto out; } @@ -1210,7 +1211,7 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry, true); if (err) { - pr_err("overlayfs: failed to verify upper root origin\n"); + pr_err("failed to verify upper root origin\n"); goto out; } @@ -1233,18 +1234,18 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, err = ovl_verify_set_fh(ofs->indexdir, OVL_XATTR_ORIGIN, upperpath->dentry, true, false); if (err) - pr_err("overlayfs: failed to verify index dir 'origin' xattr\n"); + pr_err("failed to verify index dir 'origin' xattr\n"); } err = ovl_verify_upper(ofs->indexdir, upperpath->dentry, true); if (err) - pr_err("overlayfs: failed to verify index dir 'upper' xattr\n"); + pr_err("failed to verify index dir 'upper' xattr\n"); /* Cleanup bad/stale/orphan index entries */ if (!err) err = ovl_indexdir_cleanup(ofs); } if (err || !ofs->indexdir) - pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); out: mnt_drop_write(mnt); @@ -1258,7 +1259,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) if (!ofs->config.nfs_export && !ofs->upper_mnt) return true; - for (i = 0; i < ofs->numlowerfs; i++) { + for (i = 0; i < ofs->numfs; i++) { /* * We use uuid to associate an overlay lower file handle with a * lower layer, so we can accept lower fs with null uuid as long @@ -1266,8 +1267,9 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) * if we detect multiple lower fs with the same uuid, we * disable lower file handle decoding on all of them. */ - if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) { - ofs->lower_fs[i].bad_uuid = true; + if (ofs->fs[i].is_lower && + uuid_equal(&ofs->fs[i].sb->s_uuid, uuid)) { + ofs->fs[i].bad_uuid = true; return false; } } @@ -1283,13 +1285,9 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) int err; bool bad_uuid = false; - /* fsid 0 is reserved for upper fs even with non upper overlay */ - if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) - return 0; - - for (i = 0; i < ofs->numlowerfs; i++) { - if (ofs->lower_fs[i].sb == sb) - return i + 1; + for (i = 0; i < ofs->numfs; i++) { + if (ofs->fs[i].sb == sb) + return i; } if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { @@ -1297,7 +1295,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) if (ofs->config.index || ofs->config.nfs_export) { ofs->config.index = false; ofs->config.nfs_export = false; - pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + pr_warn("%s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", path->dentry); @@ -1306,35 +1304,59 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) err = get_anon_bdev(&dev); if (err) { - pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + pr_err("failed to get anonymous bdev for lowerpath\n"); return err; } - ofs->lower_fs[ofs->numlowerfs].sb = sb; - ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; - ofs->lower_fs[ofs->numlowerfs].bad_uuid = bad_uuid; - ofs->numlowerfs++; + ofs->fs[ofs->numfs].sb = sb; + ofs->fs[ofs->numfs].pseudo_dev = dev; + ofs->fs[ofs->numfs].bad_uuid = bad_uuid; - return ofs->numlowerfs; + return ofs->numfs++; } -static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, - struct path *stack, unsigned int numlower) +static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, + struct path *stack, unsigned int numlower) { int err; unsigned int i; + struct ovl_layer *layers; err = -ENOMEM; - ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer), - GFP_KERNEL); - if (ofs->lower_layers == NULL) + layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL); + if (!layers) goto out; + ofs->layers = layers; - ofs->lower_fs = kcalloc(numlower, sizeof(struct ovl_sb), - GFP_KERNEL); - if (ofs->lower_fs == NULL) + ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL); + if (ofs->fs == NULL) goto out; + /* idx/fsid 0 are reserved for upper fs even with lower only overlay */ + ofs->numfs++; + + layers[0].mnt = ofs->upper_mnt; + layers[0].idx = 0; + layers[0].fsid = 0; + ofs->numlayer = 1; + + /* + * All lower layers that share the same fs as upper layer, use the same + * pseudo_dev as upper layer. Allocate fs[0].pseudo_dev even for lower + * only overlay to simplify ovl_fs_free(). + * is_lower will be set if upper fs is shared with a lower layer. + */ + err = get_anon_bdev(&ofs->fs[0].pseudo_dev); + if (err) { + pr_err("failed to get anonymous bdev for upper fs\n"); + goto out; + } + + if (ofs->upper_mnt) { + ofs->fs[0].sb = ofs->upper_mnt->mnt_sb; + ofs->fs[0].is_lower = false; + } + for (i = 0; i < numlower; i++) { struct vfsmount *mnt; struct inode *trap; @@ -1357,7 +1379,7 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, mnt = clone_private_mount(&stack[i]); err = PTR_ERR(mnt); if (IS_ERR(mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); + pr_err("failed to clone lowerpath\n"); iput(trap); goto out; } @@ -1368,15 +1390,13 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, */ mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; - ofs->lower_layers[ofs->numlower].trap = trap; - ofs->lower_layers[ofs->numlower].mnt = mnt; - ofs->lower_layers[ofs->numlower].idx = i + 1; - ofs->lower_layers[ofs->numlower].fsid = fsid; - if (fsid) { - ofs->lower_layers[ofs->numlower].fs = - &ofs->lower_fs[fsid - 1]; - } - ofs->numlower++; + layers[ofs->numlayer].trap = trap; + layers[ofs->numlayer].mnt = mnt; + layers[ofs->numlayer].idx = ofs->numlayer; + layers[ofs->numlayer].fsid = fsid; + layers[ofs->numlayer].fs = &ofs->fs[fsid]; + ofs->numlayer++; + ofs->fs[fsid].is_lower = true; } /* @@ -1387,22 +1407,23 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, * bits reserved for fsid, it emits a warning and uses the original * inode number. */ - if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) { - ofs->xino_bits = 0; - ofs->config.xino = OVL_XINO_OFF; - } else if (ofs->config.xino == OVL_XINO_ON && !ofs->xino_bits) { + if (ofs->numfs - !ofs->upper_mnt == 1) { + if (ofs->config.xino == OVL_XINO_ON) + pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n"); + ofs->xino_mode = 0; + } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) { /* - * This is a roundup of number of bits needed for numlowerfs+1 - * (i.e. ilog2(numlowerfs+1 - 1) + 1). fsid 0 is reserved for - * upper fs even with non upper overlay. + * This is a roundup of number of bits needed for encoding + * fsid, where fsid 0 is reserved for upper fs even with + * lower only overlay. */ BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31); - ofs->xino_bits = ilog2(ofs->numlowerfs) + 1; + ofs->xino_mode = ilog2(ofs->numfs - 1) + 1; } - if (ofs->xino_bits) { - pr_info("overlayfs: \"xino\" feature enabled using %d upper inode bits.\n", - ofs->xino_bits); + if (ofs->xino_mode > 0) { + pr_info("\"xino\" feature enabled using %d upper inode bits.\n", + ofs->xino_mode); } err = 0; @@ -1428,15 +1449,15 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); if (stacklen > OVL_MAX_STACK) { - pr_err("overlayfs: too many lower directories, limit is %d\n", + pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK); goto out_err; } else if (!ofs->config.upperdir && stacklen == 1) { - pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n"); goto out_err; } else if (!ofs->config.upperdir && ofs->config.nfs_export && ofs->config.redirect_follow) { - pr_warn("overlayfs: NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); + pr_warn("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } @@ -1459,11 +1480,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, err = -EINVAL; sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { - pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + pr_err("maximum fs stacking depth exceeded\n"); goto out_err; } - err = ovl_get_lower_layers(sb, ofs, stack, numlower); + err = ovl_get_layers(sb, ofs, stack, numlower); if (err) goto out_err; @@ -1474,7 +1495,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, for (i = 0; i < numlower; i++) { oe->lowerstack[i].dentry = dget(stack[i].dentry); - oe->lowerstack[i].layer = &ofs->lower_layers[i]; + oe->lowerstack[i].layer = &ofs->layers[i+1]; } if (remote) @@ -1515,7 +1536,7 @@ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, while (!err && parent != next) { if (ovl_lookup_trap_inode(sb, parent)) { err = -ELOOP; - pr_err("overlayfs: overlapping %s path\n", name); + pr_err("overlapping %s path\n", name); } else if (ovl_is_inuse(parent)) { err = ovl_report_in_use(ofs, name); } @@ -1555,9 +1576,9 @@ static int ovl_check_overlapping_layers(struct super_block *sb, return err; } - for (i = 0; i < ofs->numlower; i++) { + for (i = 1; i < ofs->numlayer; i++) { err = ovl_check_layer(sb, ofs, - ofs->lower_layers[i].mnt->mnt_root, + ofs->layers[i].mnt->mnt_root, "lowerdir"); if (err) return err; @@ -1595,7 +1616,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; if (!ofs->config.lowerdir) { if (!silent) - pr_err("overlayfs: missing 'lowerdir'\n"); + pr_err("missing 'lowerdir'\n"); goto out_err; } @@ -1603,14 +1624,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ if (ofs->config.xino != OVL_XINO_OFF) - ofs->xino_bits = BITS_PER_LONG - 32; + ofs->xino_mode = BITS_PER_LONG - 32; /* alloc/destroy_inode needed for setting up traps in inode cache */ sb->s_op = &ovl_super_operations; if (ofs->config.upperdir) { if (!ofs->config.workdir) { - pr_err("overlayfs: missing 'workdir'\n"); + pr_err("missing 'workdir'\n"); goto out_err; } @@ -1660,13 +1681,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ofs->indexdir) { ofs->config.index = false; if (ofs->upper_mnt && ofs->config.nfs_export) { - pr_warn("overlayfs: NFS export requires an index dir, falling back to nfs_export=off.\n"); + pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } } if (ofs->config.metacopy && ofs->config.nfs_export) { - pr_warn("overlayfs: NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); + pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } @@ -1749,9 +1770,15 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; - err = register_filesystem(&ovl_fs_type); - if (err) - kmem_cache_destroy(ovl_inode_cachep); + err = ovl_aio_request_cache_init(); + if (!err) { + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; + + ovl_aio_request_cache_destroy(); + } + kmem_cache_destroy(ovl_inode_cachep); return err; } @@ -1766,7 +1793,7 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - + ovl_aio_request_cache_destroy(); } module_init(ovl_init); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f5678a3f8350..ea005085803f 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -40,18 +40,6 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } -struct super_block *ovl_same_sb(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - if (!ofs->numlowerfs) - return ofs->upper_mnt->mnt_sb; - else if (ofs->numlowerfs == 1 && !ofs->upper_mnt) - return ofs->lower_fs[0].sb; - else - return NULL; -} - /* * Check if underlying fs supports file handles and try to determine encoding * type, in order to deduce maximum inode number used by fs. @@ -198,7 +186,7 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry) return oe->numlower ? oe->lowerstack[0].dentry : NULL; } -struct ovl_layer *ovl_layer_lower(struct dentry *dentry) +const struct ovl_layer *ovl_layer_lower(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; @@ -576,7 +564,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, err = ovl_do_setxattr(upperdentry, name, value, size, 0); if (err == -EOPNOTSUPP) { - pr_warn("overlayfs: cannot set %s xattr on upper\n", name); + pr_warn("cannot set %s xattr on upper\n", name); ofs->noxattr = true; return xerr; } @@ -700,7 +688,7 @@ static void ovl_cleanup_index(struct dentry *dentry) inode = d_inode(upperdentry); if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { - pr_warn_ratelimited("overlayfs: cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", + pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", upperdentry, inode->i_ino, inode->i_nlink); /* * We either have a bug with persistent union nlink or a lower @@ -739,7 +727,7 @@ out: return; fail: - pr_err("overlayfs: cleanup index of '%pd2' failed (%i)\n", dentry, err); + pr_err("cleanup index of '%pd2' failed (%i)\n", dentry, err); goto out; } @@ -830,7 +818,7 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) err_unlock: unlock_rename(workdir, upperdir); err: - pr_err("overlayfs: failed to lock workdir+upperdir\n"); + pr_err("failed to lock workdir+upperdir\n"); return -EIO; } @@ -852,7 +840,7 @@ int ovl_check_metacopy_xattr(struct dentry *dentry) return 1; out: - pr_warn_ratelimited("overlayfs: failed to get metacopy (%i)\n", res); + pr_warn_ratelimited("failed to get metacopy (%i)\n", res); return res; } @@ -899,7 +887,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, return res; fail: - pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n", + pr_warn_ratelimited("failed to get xattr %s: err=%zi)\n", name, res); kfree(buf); return res; @@ -931,7 +919,7 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) return buf; invalid: - pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); + pr_warn_ratelimited("invalid redirect (%s)\n", buf); res = -EINVAL; kfree(buf); return ERR_PTR(res); diff --git a/fs/proc/Makefile b/fs/proc/Makefile index ead487e80510..bd08616ed8ba 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -33,3 +33,4 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o +proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c new file mode 100644 index 000000000000..9955d75c0585 --- /dev/null +++ b/fs/proc/bootconfig.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * /proc/bootconfig - Extra boot configuration + */ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/printk.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/bootconfig.h> +#include <linux/slab.h> + +static char *saved_boot_config; + +static int boot_config_proc_show(struct seq_file *m, void *v) +{ + if (saved_boot_config) + seq_puts(m, saved_boot_config); + return 0; +} + +/* Rest size of buffer */ +#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0) + +/* Return the needed total length if @size is 0 */ +static int __init copy_xbc_key_value_list(char *dst, size_t size) +{ + struct xbc_node *leaf, *vnode; + const char *val; + char *key, *end = dst + size; + int ret = 0; + + key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL); + + xbc_for_each_key_value(leaf, val) { + ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX); + if (ret < 0) + break; + ret = snprintf(dst, rest(dst, end), "%s = ", key); + if (ret < 0) + break; + dst += ret; + vnode = xbc_node_get_child(leaf); + if (vnode && xbc_node_is_array(vnode)) { + xbc_array_for_each_value(vnode, val) { + ret = snprintf(dst, rest(dst, end), "\"%s\"%s", + val, vnode->next ? ", " : "\n"); + if (ret < 0) + goto out; + dst += ret; + } + } else { + ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val); + if (ret < 0) + break; + dst += ret; + } + } +out: + kfree(key); + + return ret < 0 ? ret : dst - (end - size); +} + +static int __init proc_boot_config_init(void) +{ + int len; + + len = copy_xbc_key_value_list(NULL, 0); + if (len < 0) + return len; + + if (len > 0) { + saved_boot_config = kzalloc(len + 1, GFP_KERNEL); + if (!saved_boot_config) + return -ENOMEM; + + len = copy_xbc_key_value_list(saved_boot_config, len + 1); + if (len < 0) { + kfree(saved_boot_config); + return len; + } + } + + proc_create_single("bootconfig", 0, NULL, boot_config_proc_show); + + return 0; +} +fs_initcall(proc_boot_config_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 96f1087e372c..c1dea9b8222e 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -16,16 +16,16 @@ static int cpuinfo_open(struct inode *inode, struct file *file) return seq_open(file, &cpuinfo_op); } -static const struct file_operations proc_cpuinfo_operations = { - .open = cpuinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops cpuinfo_proc_ops = { + .proc_open = cpuinfo_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; static int __init proc_cpuinfo_init(void) { - proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops); return 0; } fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 074e9585c699..3faed94e4b65 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -473,7 +473,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { ent->data = data; - ent->proc_fops = &proc_dir_operations; + ent->proc_dir_ops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; ent = proc_register(parent, ent); } @@ -503,7 +503,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent = __proc_create(&parent, name, mode, 2); if (ent) { ent->data = NULL; - ent->proc_fops = NULL; + ent->proc_dir_ops = NULL; ent->proc_iops = NULL; ent = proc_register(parent, ent); } @@ -533,25 +533,23 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, - const struct file_operations *proc_fops, void *data) + const struct proc_ops *proc_ops, void *data) { struct proc_dir_entry *p; - BUG_ON(proc_fops == NULL); - p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; - p->proc_fops = proc_fops; + p->proc_ops = proc_ops; return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, - const struct file_operations *proc_fops) + const struct proc_ops *proc_ops) { - return proc_create_data(name, mode, parent, proc_fops, NULL); + return proc_create_data(name, mode, parent, proc_ops, NULL); } EXPORT_SYMBOL(proc_create); @@ -573,11 +571,11 @@ static int proc_seq_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static const struct file_operations proc_seq_fops = { - .open = proc_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_seq_release, +static const struct proc_ops proc_seq_ops = { + .proc_open = proc_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = proc_seq_release, }; struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, @@ -589,7 +587,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; - p->proc_fops = &proc_seq_fops; + p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); @@ -603,11 +601,11 @@ static int proc_single_open(struct inode *inode, struct file *file) return single_open(file, de->single_show, de->data); } -static const struct file_operations proc_single_fops = { - .open = proc_single_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops proc_single_ops = { + .proc_open = proc_single_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, @@ -619,7 +617,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; - p->proc_fops = &proc_single_fops; + p->proc_ops = &proc_single_ops; p->single_show = show; return proc_register(parent, p); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index dbe43a50caf2..6da18316d209 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -163,7 +163,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; - pde->proc_fops->release(file_inode(file), file); + pde->proc_ops->proc_release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); @@ -200,12 +200,12 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (use_pde(pde)) { - typeof_member(struct file_operations, llseek) llseek; + typeof_member(struct proc_ops, proc_lseek) lseek; - llseek = pde->proc_fops->llseek; - if (!llseek) - llseek = default_llseek; - rv = llseek(file, offset, whence); + lseek = pde->proc_ops->proc_lseek; + if (!lseek) + lseek = default_llseek; + rv = lseek(file, offset, whence); unuse_pde(pde); } return rv; @@ -216,9 +216,9 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, read) read; + typeof_member(struct proc_ops, proc_read) read; - read = pde->proc_fops->read; + read = pde->proc_ops->proc_read; if (read) rv = read(file, buf, count, ppos); unuse_pde(pde); @@ -231,9 +231,9 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, write) write; + typeof_member(struct proc_ops, proc_write) write; - write = pde->proc_fops->write; + write = pde->proc_ops->proc_write; if (write) rv = write(file, buf, count, ppos); unuse_pde(pde); @@ -246,9 +246,9 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; if (use_pde(pde)) { - typeof_member(struct file_operations, poll) poll; + typeof_member(struct proc_ops, proc_poll) poll; - poll = pde->proc_fops->poll; + poll = pde->proc_ops->proc_poll; if (poll) rv = poll(file, pts); unuse_pde(pde); @@ -261,9 +261,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (use_pde(pde)) { - typeof_member(struct file_operations, unlocked_ioctl) ioctl; + typeof_member(struct proc_ops, proc_ioctl) ioctl; - ioctl = pde->proc_fops->unlocked_ioctl; + ioctl = pde->proc_ops->proc_ioctl; if (ioctl) rv = ioctl(file, cmd, arg); unuse_pde(pde); @@ -277,9 +277,9 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (use_pde(pde)) { - typeof_member(struct file_operations, compat_ioctl) compat_ioctl; + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - compat_ioctl = pde->proc_fops->compat_ioctl; + compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) rv = compat_ioctl(file, cmd, arg); unuse_pde(pde); @@ -293,9 +293,9 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, mmap) mmap; + typeof_member(struct proc_ops, proc_mmap) mmap; - mmap = pde->proc_fops->mmap; + mmap = pde->proc_ops->proc_mmap; if (mmap) rv = mmap(file, vma); unuse_pde(pde); @@ -312,9 +312,9 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, get_unmapped_area) get_area; + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - get_area = pde->proc_fops->get_unmapped_area; + get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU if (!get_area) get_area = current->mm->get_unmapped_area; @@ -333,8 +333,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; - typeof_member(struct file_operations, open) open; - typeof_member(struct file_operations, release) release; + typeof_member(struct proc_ops, proc_open) open; + typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; /* @@ -351,7 +351,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!use_pde(pde)) return -ENOENT; - release = pde->proc_fops->release; + release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { @@ -360,7 +360,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) } } - open = pde->proc_fops->open; + open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); @@ -468,21 +468,23 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); - WARN_ON(!de->proc_iops); - inode->i_op = de->proc_iops; - if (de->proc_fops) { - if (S_ISREG(inode->i_mode)) { + + if (S_ISREG(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT - if (!de->proc_fops->compat_ioctl) - inode->i_fop = - &proc_reg_file_ops_no_compat; - else -#endif - inode->i_fop = &proc_reg_file_ops; - } else { - inode->i_fop = de->proc_fops; + if (!de->proc_ops->proc_compat_ioctl) { + inode->i_fop = &proc_reg_file_ops_no_compat; } - } +#endif + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = de->proc_dir_ops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = NULL; + } else + BUG(); } else pde_put(de); return inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0f3b557c9b77..41587276798e 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -39,7 +39,10 @@ struct proc_dir_entry { spinlock_t pde_unload_lock; struct completion *pde_unload_completion; const struct inode_operations *proc_iops; - const struct file_operations *proc_fops; + union { + const struct proc_ops *proc_ops; + const struct file_operations *proc_dir_ops; + }; const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e2ed8e08cc7a..8ba492d44e68 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -574,11 +574,11 @@ static int release_kcore(struct inode *inode, struct file *file) return 0; } -static const struct file_operations proc_kcore_operations = { - .read = read_kcore, - .open = open_kcore, - .release = release_kcore, - .llseek = default_llseek, +static const struct proc_ops kcore_proc_ops = { + .proc_read = read_kcore, + .proc_open = open_kcore, + .proc_release = release_kcore, + .proc_lseek = default_llseek, }; /* just remember that we have to update kcore */ @@ -637,8 +637,7 @@ static void __init add_modules_range(void) static int __init proc_kcore_init(void) { - proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, - &proc_kcore_operations); + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops); if (!proc_root_kcore) { pr_err("couldn't create /proc/kcore\n"); return 0; /* Always returns 0. */ diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 4f4a2abb225e..ec1b7d2fb773 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -49,17 +49,17 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait) } -static const struct file_operations proc_kmsg_operations = { - .read = kmsg_read, - .poll = kmsg_poll, - .open = kmsg_open, - .release = kmsg_release, - .llseek = generic_file_llseek, +static const struct proc_ops kmsg_proc_ops = { + .proc_read = kmsg_read, + .proc_poll = kmsg_poll, + .proc_open = kmsg_open, + .proc_release = kmsg_release, + .proc_lseek = generic_file_llseek, }; static int __init proc_kmsg_init(void) { - proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); + proc_create("kmsg", S_IRUSR, NULL, &kmsg_proc_ops); return 0; } fs_initcall(proc_kmsg_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index 7c952ee732e6..f909243d4a66 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -21,6 +21,21 @@ #define KPMMASK (KPMSIZE - 1) #define KPMBITS (KPMSIZE * BITS_PER_BYTE) +static inline unsigned long get_max_dump_pfn(void) +{ +#ifdef CONFIG_SPARSEMEM + /* + * The memmap of early sections is completely populated and marked + * online even if max_pfn does not fall on a section boundary - + * pfn_to_online_page() will succeed on all pages. Allow inspecting + * these memmaps. + */ + return round_up(max_pfn, PAGES_PER_SECTION); +#else + return max_pfn; +#endif +} + /* /proc/kpagecount - an array exposing page counts * * Each entry is a u64 representing the corresponding @@ -29,6 +44,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; struct page *ppage; unsigned long src = *ppos; @@ -37,9 +53,11 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, u64 pcount; pfn = src / KPMSIZE; - count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { /* @@ -71,9 +89,9 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpagecount_operations = { - .llseek = mem_lseek, - .read = kpagecount_read, +static const struct proc_ops kpagecount_proc_ops = { + .proc_lseek = mem_lseek, + .proc_read = kpagecount_read, }; /* /proc/kpageflags - an array exposing page flags @@ -206,6 +224,7 @@ u64 stable_page_flags(struct page *page) static ssize_t kpageflags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; struct page *ppage; unsigned long src = *ppos; @@ -213,9 +232,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, ssize_t ret = 0; pfn = src / KPMSIZE; - count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { /* @@ -242,15 +263,16 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpageflags_operations = { - .llseek = mem_lseek, - .read = kpageflags_read, +static const struct proc_ops kpageflags_proc_ops = { + .proc_lseek = mem_lseek, + .proc_read = kpageflags_read, }; #ifdef CONFIG_MEMCG static ssize_t kpagecgroup_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; struct page *ppage; unsigned long src = *ppos; @@ -259,9 +281,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, u64 ino; pfn = src / KPMSIZE; - count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { /* @@ -293,18 +317,18 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpagecgroup_operations = { - .llseek = mem_lseek, - .read = kpagecgroup_read, +static const struct proc_ops kpagecgroup_proc_ops = { + .proc_lseek = mem_lseek, + .proc_read = kpagecgroup_read, }; #endif /* CONFIG_MEMCG */ static int __init proc_page_init(void) { - proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); - proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops); + proc_create("kpageflags", S_IRUSR, NULL, &kpageflags_proc_ops); #ifdef CONFIG_MEMCG - proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations); + proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops); #endif return 0; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 76ae278df1c4..4888c5224442 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -90,12 +90,12 @@ static int seq_release_net(struct inode *ino, struct file *f) return 0; } -static const struct file_operations proc_net_seq_fops = { - .open = seq_open_net, - .read = seq_read, - .write = proc_simple_write, - .llseek = seq_lseek, - .release = seq_release_net, +static const struct proc_ops proc_net_seq_ops = { + .proc_open = seq_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = seq_release_net, }; struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, @@ -108,7 +108,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_seq_fops; + p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); @@ -152,7 +152,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_seq_fops; + p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; p->write = write; @@ -183,12 +183,12 @@ static int single_release_net(struct inode *ino, struct file *f) return single_release(ino, f); } -static const struct file_operations proc_net_single_fops = { - .open = single_open_net, - .read = seq_read, - .write = proc_simple_write, - .llseek = seq_lseek, - .release = single_release_net, +static const struct proc_ops proc_net_single_ops = { + .proc_open = single_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = single_release_net, }; struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, @@ -201,7 +201,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_single_fops; + p->proc_ops = &proc_net_single_ops; p->single_show = show; return proc_register(parent, p); } @@ -244,7 +244,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_single_fops; + p->proc_ops = &proc_net_single_ops; p->single_show = show; p->write = write; return proc_register(parent, p); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d80989b6c344..c75bb4632ed1 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1720,7 +1720,7 @@ int __init proc_sys_init(void) proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; - proc_sys_root->proc_fops = &proc_sys_dir_file_operations; + proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; return sysctl_init(); diff --git a/fs/proc/root.c b/fs/proc/root.c index 0b7c8dffc9ae..72c07a34cff0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -292,7 +292,7 @@ struct proc_dir_entry proc_root = { .nlink = 2, .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, - .proc_fops = &proc_root_operations, + .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, .name = "/proc", diff --git a/fs/proc/stat.c b/fs/proc/stat.c index fd931d3e77be..0449edf460f5 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -223,16 +223,16 @@ static int stat_open(struct inode *inode, struct file *file) return single_open_size(file, show_stat, NULL, size); } -static const struct file_operations proc_stat_operations = { - .open = stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops stat_proc_ops = { + .proc_open = stat_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; static int __init proc_stat_init(void) { - proc_create("stat", 0, NULL, &proc_stat_operations); + proc_create("stat", 0, NULL, &stat_proc_ops); return 0; } fs_initcall(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9442631fd4af..3ba9ae83bff5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -505,7 +505,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, #ifdef CONFIG_SHMEM static int smaps_pte_hole(unsigned long addr, unsigned long end, - struct mm_walk *walk) + __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; @@ -1282,7 +1282,7 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, } static int pagemap_pte_hole(unsigned long start, unsigned long end, - struct mm_walk *walk) + __always_unused int depth, struct mm_walk *walk) { struct pagemapread *pm = walk->private; unsigned long addr = start; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7b13988796e1..7dc800cce354 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -667,10 +667,10 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) } #endif -static const struct file_operations proc_vmcore_operations = { - .read = read_vmcore, - .llseek = default_llseek, - .mmap = mmap_vmcore, +static const struct proc_ops vmcore_proc_ops = { + .proc_read = read_vmcore, + .proc_lseek = default_llseek, + .proc_mmap = mmap_vmcore, }; static struct vmcore* __init get_new_element(void) @@ -1555,7 +1555,7 @@ static int __init vmcore_init(void) elfcorehdr_free(elfcorehdr_addr); elfcorehdr_addr = ELFCORE_ADDR_ERR; - proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &vmcore_proc_ops); if (proc_vmcore) proc_vmcore->size = vmcore_size; return 0; diff --git a/fs/read_write.c b/fs/read_write.c index 7458fccc59e1..59d819c5b92e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -939,6 +939,34 @@ out: return ret; } +ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) +{ + size_t tot_len; + ssize_t ret = 0; + + if (!file->f_op->read_iter) + return -EINVAL; + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + goto out; + ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); + if (ret < 0) + return ret; + + ret = call_read_iter(file, iocb, iter); +out: + if (ret >= 0) + fsnotify_access(file); + return ret; +} +EXPORT_SYMBOL(vfs_iocb_iter_read); + ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { @@ -975,6 +1003,34 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, return ret; } +ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) +{ + size_t tot_len; + ssize_t ret = 0; + + if (!file->f_op->write_iter) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + return 0; + ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); + if (ret < 0) + return ret; + + ret = call_write_iter(file, iocb, iter); + if (ret > 0) + fsnotify_modify(file); + + return ret; +} +EXPORT_SYMBOL(vfs_iocb_iter_write); + ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index d41c21fef138..c4ab045926b7 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -449,7 +449,7 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, } link = kernfs_create_link(kobj->sd, target_name, entry); - if (IS_ERR(link) && PTR_ERR(link) == -EEXIST) + if (PTR_ERR(link) == -EEXIST) sysfs_warn_dup(kobj->sd, target_name); kernfs_put(entry); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 0caa151cae4e..0ee8c6dfb036 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -330,7 +330,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) parent = tracefs_mount->mnt_root; inode_lock(parent->d_inode); - dentry = lookup_one_len(name, parent, strlen(name)); + if (unlikely(IS_DEADDIR(parent->d_inode))) + dentry = ERR_PTR(-ENOENT); + else + dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); dentry = ERR_PTR(-EEXIST); @@ -499,122 +502,27 @@ __init struct dentry *tracefs_create_instance_dir(const char *name, return dentry; } -static int __tracefs_remove(struct dentry *dentry, struct dentry *parent) +static void remove_one(struct dentry *victim) { - int ret = 0; - - if (simple_positive(dentry)) { - if (dentry->d_inode) { - dget(dentry); - switch (dentry->d_inode->i_mode & S_IFMT) { - case S_IFDIR: - ret = simple_rmdir(parent->d_inode, dentry); - if (!ret) - fsnotify_rmdir(parent->d_inode, dentry); - break; - default: - simple_unlink(parent->d_inode, dentry); - fsnotify_unlink(parent->d_inode, dentry); - break; - } - if (!ret) - d_delete(dentry); - dput(dentry); - } - } - return ret; -} - -/** - * tracefs_remove - removes a file or directory from the tracefs filesystem - * @dentry: a pointer to a the dentry of the file or directory to be - * removed. - * - * This function removes a file or directory in tracefs that was previously - * created with a call to another tracefs function (like - * tracefs_create_file() or variants thereof.) - */ -void tracefs_remove(struct dentry *dentry) -{ - struct dentry *parent; - int ret; - - if (IS_ERR_OR_NULL(dentry)) - return; - - parent = dentry->d_parent; - inode_lock(parent->d_inode); - ret = __tracefs_remove(dentry, parent); - inode_unlock(parent->d_inode); - if (!ret) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); } /** - * tracefs_remove_recursive - recursively removes a directory + * tracefs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. * * This function recursively removes a directory tree in tracefs that * was previously created with a call to another tracefs function * (like tracefs_create_file() or variants thereof.) */ -void tracefs_remove_recursive(struct dentry *dentry) +void tracefs_remove(struct dentry *dentry) { - struct dentry *child, *parent; - if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry; - down: - inode_lock(parent->d_inode); - loop: - /* - * The parent->d_subdirs is protected by the d_lock. Outside that - * lock, the child can be unlinked and set to be freed which can - * use the d_u.d_child as the rcu head and corrupt this list. - */ - spin_lock(&parent->d_lock); - list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!simple_positive(child)) - continue; - - /* perhaps simple_empty(child) makes more sense */ - if (!list_empty(&child->d_subdirs)) { - spin_unlock(&parent->d_lock); - inode_unlock(parent->d_inode); - parent = child; - goto down; - } - - spin_unlock(&parent->d_lock); - - if (!__tracefs_remove(child, parent)) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - - /* - * The parent->d_lock protects agaist child from unlinking - * from d_subdirs. When releasing the parent->d_lock we can - * no longer trust that the next pointer is valid. - * Restart the loop. We'll skip this one with the - * simple_positive() check. - */ - goto loop; - } - spin_unlock(&parent->d_lock); - - inode_unlock(parent->d_inode); - child = parent; - parent = parent->d_parent; - inode_lock(parent->d_inode); - - if (child != dentry) - /* go up */ - goto loop; - - if (!__tracefs_remove(child, parent)) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - inode_unlock(parent->d_inode); + simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count); + simple_recursive_removal(dentry, remove_one); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); } /** diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index bc4dec5b1633..743928efffc1 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1080,18 +1080,12 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if (attr->ia_valid & ATTR_ATIME) { - inode->i_atime = timestamp_truncate(attr->ia_atime, - inode); - } - if (attr->ia_valid & ATTR_MTIME) { - inode->i_mtime = timestamp_truncate(attr->ia_mtime, - inode); - } - if (attr->ia_valid & ATTR_CTIME) { - inode->i_ctime = timestamp_truncate(attr->ia_ctime, - inode); - } + if (attr->ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; if (attr->ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 17c90dff7266..4b4b65b48c57 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -84,7 +84,6 @@ static int create_default_filesystem(struct ubifs_info *c) int idx_node_size; long long tmp64, main_bytes; __le64 tmp_le64; - __le32 tmp_le32; struct timespec64 ts; u8 hash[UBIFS_HASH_ARR_SZ]; u8 hash_lpt[UBIFS_HASH_ARR_SZ]; @@ -291,16 +290,14 @@ static int create_default_filesystem(struct ubifs_info *c) ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); ino->nlink = cpu_to_le32(2); - ktime_get_real_ts64(&ts); - ts = timespec64_trunc(ts, DEFAULT_TIME_GRAN); + ktime_get_coarse_real_ts64(&ts); tmp_le64 = cpu_to_le64(ts.tv_sec); ino->atime_sec = tmp_le64; ino->ctime_sec = tmp_le64; ino->mtime_sec = tmp_le64; - tmp_le32 = cpu_to_le32(ts.tv_nsec); - ino->atime_nsec = tmp_le32; - ino->ctime_nsec = tmp_le32; - ino->mtime_nsec = tmp_le32; + ino->atime_nsec = 0; + ino->ctime_nsec = 0; + ino->mtime_nsec = 0; ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); diff --git a/fs/utimes.c b/fs/utimes.c index c952b6b3d8a0..1d17ce98cb80 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -36,14 +36,14 @@ static int utimes_common(const struct path *path, struct timespec64 *times) if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; else if (times[0].tv_nsec != UTIME_NOW) { - newattrs.ia_atime = timestamp_truncate(times[0], inode); + newattrs.ia_atime = times[0]; newattrs.ia_valid |= ATTR_ATIME_SET; } if (times[1].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_MTIME; else if (times[1].tv_nsec != UTIME_NOW) { - newattrs.ia_mtime = timestamp_truncate(times[1], inode); + newattrs.ia_mtime = times[1]; newattrs.ia_valid |= ATTR_MTIME_SET; } /* diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 14fbdf22b7e7..08d6beb54f8c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -23,25 +23,28 @@ #include "xfs_ag_resv.h" #include "xfs_health.h" -static struct xfs_buf * +static int xfs_get_aghdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, + struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; + int error; - bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0); - if (!bp) - return NULL; + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); + if (error) + return error; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; - return bp; + *bpp = bp; + return 0; } static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id) @@ -340,13 +343,13 @@ xfs_ag_init_hdr( struct aghdr_init_data *id, aghdr_init_work_f work, const struct xfs_buf_ops *ops) - { struct xfs_buf *bp; + int error; - bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops); - if (!bp) - return -ENOMEM; + error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops); + if (error) + return error; (*work)(mp, bp, id); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index fc93fd88ec89..d8053bc96c4d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1070,11 +1070,11 @@ xfs_alloc_ag_vextent_small( if (args->datatype & XFS_ALLOC_USERDATA) { struct xfs_buf *bp; - bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno); - if (XFS_IS_CORRUPT(args->mp, !bp)) { - error = -EFSCORRUPTED; + error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, + XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), + args->mp->m_bsize, 0, &bp); + if (error) goto error; - } xfs_trans_binval(args->tp, bp); } *fbnop = args->agbno = fbno; @@ -2347,9 +2347,11 @@ xfs_free_agfl_block( if (error) return error; - bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno); - if (XFS_IS_CORRUPT(tp->t_mountp, !bp)) - return -EFSCORRUPTED; + error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, + XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), + tp->t_mountp->m_bsize, 0, &bp); + if (error) + return error; xfs_trans_binval(tp, bp); return 0; @@ -2500,12 +2502,11 @@ xfs_alloc_fix_freelist( if (!pag->pagf_init) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); - if (error) + if (error) { + /* Couldn't lock the AGF so skip this AG. */ + if (error == -EAGAIN) + error = 0; goto out_no_agbp; - if (!pag->pagf_init) { - ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - goto out_agbp_relse; } } @@ -2531,11 +2532,10 @@ xfs_alloc_fix_freelist( */ if (!agbp) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); - if (error) - goto out_no_agbp; - if (!agbp) { - ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + if (error) { + /* Couldn't lock the AGF so skip this AG. */ + if (error == -EAGAIN) + error = 0; goto out_no_agbp; } } @@ -2766,11 +2766,10 @@ xfs_alloc_pagf_init( xfs_buf_t *bp; int error; - if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) - return error; - if (bp) + error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp); + if (!error) xfs_trans_brelse(tp, bp); - return 0; + return error; } /* @@ -2956,14 +2955,11 @@ xfs_read_agf( trace_xfs_read_agf(mp, agno); ASSERT(agno != NULLAGNUMBER); - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; - if (!*bpp) - return 0; ASSERT(!(*bpp)->b_error); xfs_buf_set_ref(*bpp, XFS_AGF_REF); @@ -2987,14 +2983,15 @@ xfs_alloc_read_agf( trace_xfs_alloc_read_agf(mp, agno); + /* We don't support trylock when freeing. */ + ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != + (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); ASSERT(agno != NULLAGNUMBER); error = xfs_read_agf(mp, tp, agno, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, bpp); if (error) return error; - if (!*bpp) - return 0; ASSERT(!(*bpp)->b_error); agf = XFS_BUF_TO_AGF(*bpp); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index a266d05df146..8b7f74b3bea2 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -418,20 +418,10 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, - &xfs_attr3_rmt_buf_ops); - if (!bp) - return -ENOMEM; - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; + error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, + 0, &bp, &xfs_attr3_rmt_buf_ops); + if (error) return error; - } error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, @@ -555,9 +545,9 @@ xfs_attr_rmtval_set( dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt); - if (!bp) - return -ENOMEM; + error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, &bp); + if (error) + return error; bp->b_ops = &xfs_attr3_rmt_buf_ops; xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4c2e046fbfad..9a6d7a84689a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -730,11 +730,11 @@ xfs_bmap_extents_to_btree( cur->bc_private.b.allocated++; ip->i_d.di_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - abp = xfs_btree_get_bufl(mp, tp, args.fsbno); - if (XFS_IS_CORRUPT(mp, !abp)) { - error = -EFSCORRUPTED; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, args.fsbno), + mp->m_bsize, 0, &abp); + if (error) goto out_unreserve_dquot; - } /* * Fill in the child block. @@ -878,7 +878,11 @@ xfs_bmap_local_to_extents( ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); tp->t_firstblock = args.fsbno; - bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno); + error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, + XFS_FSB_TO_DADDR(args.mp, args.fsbno), + args.mp->m_bsize, 0, &bp); + if (error) + goto done; /* * Initialize the block, copy the data and log the remote buffer. @@ -3307,11 +3311,12 @@ xfs_bmap_longest_free_extent( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); - if (error) - goto out; - - if (!pag->pagf_init) { - *notinit = 1; + if (error) { + /* Couldn't lock the AGF, so skip this AG. */ + if (error == -EAGAIN) { + *notinit = 1; + error = 0; + } goto out; } } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index b22c7e928eb1..fd300dc93ca4 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -679,42 +679,6 @@ xfs_btree_get_block( } /* - * Get a buffer for the block, return it with no data read. - * Long-form addressing. - */ -xfs_buf_t * /* buffer for fsbno */ -xfs_btree_get_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno) /* file system block number */ -{ - xfs_daddr_t d; /* real disk block address */ - - ASSERT(fsbno != NULLFSBLOCK); - d = XFS_FSB_TO_DADDR(mp, fsbno); - return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0); -} - -/* - * Get a buffer for the block, return it with no data read. - * Short-form addressing. - */ -xfs_buf_t * /* buffer for agno/agbno */ -xfs_btree_get_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno) /* allocation group block number */ -{ - xfs_daddr_t d; /* real disk block address */ - - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0); -} - -/* * Change the cursor to point to the first record at the given level. * Other levels are unaffected. */ @@ -1270,11 +1234,10 @@ xfs_btree_get_buf_block( error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, 0); - - if (!*bpp) - return -ENOMEM; + error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, + 0, bpp); + if (error) + return error; (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index fb9b2121c628..3eff7c321d43 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -297,27 +297,6 @@ xfs_btree_dup_cursor( xfs_btree_cur_t **ncur);/* output cursor */ /* - * Get a buffer for the block, return it with no data read. - * Long-form addressing. - */ -struct xfs_buf * /* buffer for fsbno */ -xfs_btree_get_bufl( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno); /* file system block number */ - -/* - * Get a buffer for the block, return it with no data read. - * Short-form addressing. - */ -struct xfs_buf * /* buffer for agno/agbno */ -xfs_btree_get_bufs( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno); /* allocation group block number */ - -/* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 8c3eafe280ed..875e04f82541 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2591,13 +2591,9 @@ xfs_da_get_buf( if (error || nmap == 0) goto out_free; - bp = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0); - error = bp ? bp->b_error : -EIO; - if (error) { - if (bp) - xfs_trans_brelse(tp, bp); + error = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0, &bp); + if (error) goto out_free; - } *bpp = bp; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 5b759af4d165..bf161e930f1d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -276,6 +276,7 @@ xfs_ialloc_inode_init( int i, j; xfs_daddr_t d; xfs_ino_t ino = 0; + int error; /* * Loop over the new block(s), filling in the inodes. For small block @@ -327,12 +328,11 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * M_IGEO(mp)->blocks_per_cluster)); - fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * - M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED); - if (!fbuf) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, + XBF_UNMAPPED, &fbuf); + if (error) + return error; /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index d7d702ee4d1a..6e1665f2cb67 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1177,8 +1177,6 @@ xfs_refcount_finish_one( XFS_ALLOC_FLAG_FREEING, &agbp); if (error) return error; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) - return -EFSCORRUPTED; rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); if (!rcur) { @@ -1718,10 +1716,6 @@ xfs_refcount_recover_cow_leftovers( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) goto out_trans; - if (!agbp) { - error = -ENOMEM; - goto out_trans; - } cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); /* Find all the leftover CoW staging extents. */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 0ac69751fe85..2f60fc3c99a0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -985,9 +985,9 @@ xfs_update_secondary_sbs( for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) { struct xfs_buf *bp; - bp = xfs_buf_get(mp->m_ddev_targp, + error = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), - XFS_FSS_TO_BB(mp, 1)); + XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, * continue. xfs_repair chooses the "best" superblock based @@ -995,12 +995,12 @@ xfs_update_secondary_sbs( * superblocks un-updated than updated, and xfs_repair may * pick them over the properly-updated primary. */ - if (!bp) { + if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", agno); if (!saved_error) - saved_error = -ENOMEM; + saved_error = error; continue; } @@ -1185,13 +1185,14 @@ xfs_sb_get_secondary( struct xfs_buf **bpp) { struct xfs_buf *bp; + int error; ASSERT(agno != 0 && agno != NULLAGNUMBER); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (!bp) - return -ENOMEM; + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) + return error; bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); *bpp = bp; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 7a1a38b636a9..d5e6db9af434 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -659,8 +659,6 @@ xrep_agfl( error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); if (error) return error; - if (!agf_bp) - return -ENOMEM; /* * Make sure we have the AGFL buffer, as scrub might have decided it @@ -735,8 +733,6 @@ xrep_agi_find_btrees( error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); if (error) return error; - if (!agf_bp) - return -ENOMEM; /* Find the btree roots. */ error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL); diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 7251c66a82c9..ec2064ed3c30 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -83,9 +83,6 @@ xchk_fscount_warmup( error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); if (error) break; - error = -ENOMEM; - if (!agf_bp || !agi_bp) - break; /* * These are supposed to be initialized by the header read diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index b70a88bc975e..e489d7a8446a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -341,13 +341,17 @@ xrep_init_btblock( struct xfs_trans *tp = sc->tp; struct xfs_mount *mp = sc->mp; struct xfs_buf *bp; + int error; trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), btnum); ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb), - XFS_FSB_TO_BB(mp, 1), 0); + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0, + &bp); + if (error) + return error; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); @@ -542,8 +546,6 @@ xrep_reap_block( error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp); if (error) return error; - if (!agf_bp) - return -ENOMEM; } else { agf_bp = sc->sa.agf_bp; } diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 8fbb841cd6fe..bbfa6ba84dcd 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -205,11 +205,12 @@ xfs_attr3_node_inactive( /* * Remove the subsidiary block from the cache and from the log. */ - child_bp = xfs_trans_get_buf(*trans, mp->m_ddev_targp, + error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, child_blkno, - XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0); - if (!child_bp) - return -EIO; + XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, + &child_bp); + if (error) + return error; error = bp->b_error; if (error) { xfs_trans_brelse(*trans, child_bp); @@ -298,10 +299,10 @@ xfs_attr3_root_inactive( /* * Invalidate the incore copy of the root block. */ - bp = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno, - XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0); - if (!bp) - return -EIO; + error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno, + XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp); + if (error) + return error; error = bp->b_error; if (error) { xfs_trans_brelse(*trans, bp); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a0229c368e78..217e4f82a44a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -198,20 +198,22 @@ xfs_buf_free_maps( } } -static struct xfs_buf * +static int _xfs_buf_alloc( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { struct xfs_buf *bp; int error; int i; + *bpp = NULL; bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) - return NULL; + return -ENOMEM; /* * We don't want certain flags to appear in b_flags unless they are @@ -239,7 +241,7 @@ _xfs_buf_alloc( error = xfs_buf_get_maps(bp, nmaps); if (error) { kmem_cache_free(xfs_buf_zone, bp); - return NULL; + return error; } bp->b_bn = map[0].bm_bn; @@ -256,7 +258,8 @@ _xfs_buf_alloc( XFS_STATS_INC(bp->b_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); - return bp; + *bpp = bp; + return 0; } /* @@ -682,53 +685,39 @@ xfs_buf_incore( * cache hits, as metadata intensive workloads will see 3 orders of magnitude * more hits than misses. */ -struct xfs_buf * +int xfs_buf_get_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; + *bpp = NULL; error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); - - switch (error) { - case 0: - /* cache hit */ + if (!error) goto found; - case -EAGAIN: - /* cache hit, trylock failure, caller handles failure */ - ASSERT(flags & XBF_TRYLOCK); - return NULL; - case -ENOENT: - /* cache miss, go for insert */ - break; - case -EFSCORRUPTED: - default: - /* - * None of the higher layers understand failure types - * yet, so return NULL to signal a fatal lookup error. - */ - return NULL; - } + if (error != -ENOENT) + return error; - new_bp = _xfs_buf_alloc(target, map, nmaps, flags); - if (unlikely(!new_bp)) - return NULL; + error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp); + if (error) + return error; error = xfs_buf_allocate_memory(new_bp, flags); if (error) { xfs_buf_free(new_bp); - return NULL; + return error; } error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); if (error) { xfs_buf_free(new_bp); - return NULL; + return error; } if (bp != new_bp) @@ -741,7 +730,7 @@ found: xfs_warn(target->bt_mount, "%s: failed to map pagesn", __func__); xfs_buf_relse(bp); - return NULL; + return error; } } @@ -754,7 +743,8 @@ found: XFS_STATS_INC(target->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); - return bp; + *bpp = bp; + return 0; } STATIC int @@ -806,46 +796,77 @@ xfs_buf_reverify( return bp->b_error; } -xfs_buf_t * +int xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - const struct xfs_buf_ops *ops) + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops, + xfs_failaddr_t fa) { struct xfs_buf *bp; + int error; flags |= XBF_READ; + *bpp = NULL; - bp = xfs_buf_get_map(target, map, nmaps, flags); - if (!bp) - return NULL; + error = xfs_buf_get_map(target, map, nmaps, flags, &bp); + if (error) + return error; trace_xfs_buf_read(bp, flags, _RET_IP_); if (!(bp->b_flags & XBF_DONE)) { + /* Initiate the buffer read and wait. */ XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; - _xfs_buf_read(bp, flags); - return bp; + error = _xfs_buf_read(bp, flags); + + /* Readahead iodone already dropped the buffer, so exit. */ + if (flags & XBF_ASYNC) + return 0; + } else { + /* Buffer already read; all we need to do is check it. */ + error = xfs_buf_reverify(bp, ops); + + /* Readahead already finished; drop the buffer and exit. */ + if (flags & XBF_ASYNC) { + xfs_buf_relse(bp); + return 0; + } + + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + ASSERT(bp->b_ops != NULL || ops == NULL); } - xfs_buf_reverify(bp, ops); + /* + * If we've had a read error, then the contents of the buffer are + * invalid and should not be used. To ensure that a followup read tries + * to pull the buffer from disk again, we clear the XBF_DONE flag and + * mark the buffer stale. This ensures that anyone who has a current + * reference to the buffer will interpret it's contents correctly and + * future cache lookups will also treat it as an empty, uninitialised + * buffer. + */ + if (error) { + if (!XFS_FORCED_SHUTDOWN(target->bt_mount)) + xfs_buf_ioerror_alert(bp, fa); - if (flags & XBF_ASYNC) { - /* - * Read ahead call which is already satisfied, - * drop the buffer - */ + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); xfs_buf_relse(bp); - return NULL; + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; } - /* We do not want read in the flags */ - bp->b_flags &= ~XBF_READ; - ASSERT(bp->b_ops != NULL || ops == NULL); - return bp; + *bpp = bp; + return 0; } /* @@ -859,11 +880,14 @@ xfs_buf_readahead_map( int nmaps, const struct xfs_buf_ops *ops) { + struct xfs_buf *bp; + if (bdi_read_congested(target->bt_bdev->bd_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); + XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, + __this_address); } /* @@ -880,12 +904,13 @@ xfs_buf_read_uncached( const struct xfs_buf_ops *ops) { struct xfs_buf *bp; + int error; *bpp = NULL; - bp = xfs_buf_get_uncached(target, numblks, flags); - if (!bp) - return -ENOMEM; + error = xfs_buf_get_uncached(target, numblks, flags, &bp); + if (error) + return error; /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); @@ -896,7 +921,7 @@ xfs_buf_read_uncached( xfs_buf_submit(bp); if (bp->b_error) { - int error = bp->b_error; + error = bp->b_error; xfs_buf_relse(bp); return error; } @@ -905,20 +930,23 @@ xfs_buf_read_uncached( return 0; } -xfs_buf_t * +int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - int flags) + int flags, + struct xfs_buf **bpp) { unsigned long page_count; int error, i; struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); + *bpp = NULL; + /* flags might contain irrelevant bits, pass only what we care about */ - bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); - if (unlikely(bp == NULL)) + error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); + if (error) goto fail; page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; @@ -928,8 +956,10 @@ xfs_buf_get_uncached( for (i = 0; i < page_count; i++) { bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); - if (!bp->b_pages[i]) + if (!bp->b_pages[i]) { + error = -ENOMEM; goto fail_free_mem; + } } bp->b_flags |= _XBF_PAGES; @@ -941,7 +971,8 @@ xfs_buf_get_uncached( } trace_xfs_buf_get_uncached(bp, _RET_IP_); - return bp; + *bpp = bp; + return 0; fail_free_mem: while (--i >= 0) @@ -951,7 +982,7 @@ xfs_buf_get_uncached( xfs_buf_free_maps(bp); kmem_cache_free(xfs_buf_zone, bp); fail: - return NULL; + return error; } /* @@ -1205,10 +1236,10 @@ __xfs_buf_ioerror( void xfs_buf_ioerror_alert( struct xfs_buf *bp, - const char *func) + xfs_failaddr_t func) { xfs_alert(bp->b_mount, -"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d", +"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, -bp->b_error); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 56e081dd1d96..d79a1fe5d738 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -192,37 +192,40 @@ struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags); -struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags); -struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags, - const struct xfs_buf_ops *ops); +int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp); +int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops, xfs_failaddr_t fa); void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, const struct xfs_buf_ops *ops); -static inline struct xfs_buf * +static inline int xfs_buf_get( struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks) + size_t numblks, + struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_get_map(target, &map, 1, 0); + + return xfs_buf_get_map(target, &map, 1, 0, bpp); } -static inline struct xfs_buf * +static inline int xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, + struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags, ops); + + return xfs_buf_read_map(target, &map, 1, flags, bpp, ops, + __builtin_return_address(0)); } static inline void @@ -236,8 +239,8 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, - int flags); +int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, + struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); @@ -259,7 +262,7 @@ extern void xfs_buf_ioend(struct xfs_buf *bp); extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) -extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); +extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); extern int __xfs_buf_submit(struct xfs_buf *bp, bool); static inline int xfs_buf_submit(struct xfs_buf *bp) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 5be8973a452c..663810e6cd59 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1113,7 +1113,7 @@ xfs_buf_iodone_callback_error( if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __this_address); } lasttarg = bp->b_target; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index cae613620175..0b8350e84d28 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -45,7 +45,7 @@ xfs_trim_extents( xfs_log_force(mp, XFS_LOG_SYNC); error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); - if (error || !agbp) + if (error) goto out_put_perag; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9cfd3209f52b..d223e1ae90a6 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -320,10 +320,10 @@ xfs_dquot_disk_alloc( dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); /* now we can just get the buffer (there's nothing to read yet) */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; bp->b_ops = &xfs_dquot_buf_ops; /* diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 5f12b5d8527a..1a88025e68a3 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -159,16 +159,15 @@ xfs_filestream_pick_ag( if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); - if (err && !trylock) { + if (err) { xfs_perag_put(pag); - return err; + if (err != -EAGAIN) + return err; + /* Couldn't lock the AGF, skip this AG. */ + continue; } } - /* Might fail sometimes during the 1st pass with trylock set. */ - if (!pag->pagf_init) - goto next_ag; - /* Keep track of the AG with the most free blocks. */ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1979a0055763..c5077e6326c7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2546,6 +2546,7 @@ xfs_ifree_cluster( struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_ino_t inum; + int error; inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); @@ -2574,12 +2575,11 @@ xfs_ifree_cluster( * complete before we get a lock on it, and hence we may fail * to mark all the active inodes on the buffer stale. */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * igeo->blocks_per_cluster, - XBF_UNMAPPED); - - if (!bp) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * igeo->blocks_per_cluster, + XBF_UNMAPPED, &bp); + if (error) + return error; /* * This buffer may not have been correctly initialised as we diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0d683fb96396..25cfc85dbaa7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -294,7 +294,7 @@ xlog_recover_iodone( * this during recovery. One strike! */ if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __this_address); xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); } } @@ -2745,15 +2745,10 @@ xlog_recover_buffer_pass2( if (buf_f->blf_flags & XFS_BLF_INODE_BUF) buf_flags |= XBF_UNMAPPED; - bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, NULL); - if (!bp) - return -ENOMEM; - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); - goto out_release; - } + error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, &bp, NULL); + if (error) + return error; /* * Recover the buffer only if we get an LSN from it and it's less than @@ -2950,17 +2945,10 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, - &xfs_inode_buf_ops); - if (!bp) { - error = -ENOMEM; + error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + 0, &bp, &xfs_inode_buf_ops); + if (error) goto error; - } - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); - goto out_release; - } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = xfs_buf_offset(bp, in_f->ilf_boffset); @@ -5639,7 +5627,7 @@ xlog_do_recover( error = xfs_buf_submit(bp); if (error) { if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __this_address); ASSERT(0); } xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e723b267a247..b0ce04ffd3cd 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -143,8 +143,6 @@ xfs_reflink_find_shared( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) return error; - if (!agbp) - return -ENOMEM; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index d42b5a2047e0..6209e7b6b895 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -826,12 +826,10 @@ xfs_growfs_rt_alloc( * Get a buffer for the block. */ d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize, 0); - if (bp == NULL) { - error = -EIO; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0, &bp); + if (error) goto out_trans_cancel; - } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index a25502bc2071..d762d42ed0ff 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -53,20 +53,10 @@ xfs_readlink_bmap_ilocked( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, - &xfs_symlink_buf_ops); - if (!bp) - return -ENOMEM; - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; - goto out; - } + error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &bp, &xfs_symlink_buf_ops); + if (error) + return error; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); if (pathlen < byte_cnt) byte_cnt = pathlen; @@ -290,12 +280,10 @@ xfs_symlink( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - if (!bp) { - error = -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0, &bp); + if (error) goto out_trans_cancel; - } bp->b_ops = &xfs_symlink_buf_ops; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); @@ -433,13 +421,12 @@ xfs_inactive_symlink_rmt( * Invalidate the block(s). No validation is done. */ for (i = 0; i < nmaps; i++) { - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); - if (!bp) { - error = -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, + &bp); + if (error) goto error_trans_cancel; - } xfs_trans_binval(tp, bp); } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 64d7f171ebd3..752c7fef9de7 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -169,21 +169,21 @@ int xfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); -struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, - struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - uint flags); +int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, + struct xfs_buf **bpp); -static inline struct xfs_buf * +static inline int xfs_trans_get_buf( struct xfs_trans *tp, struct xfs_buftarg *target, xfs_daddr_t blkno, int numblks, - uint flags) + uint flags, + struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_trans_get_buf_map(tp, target, &map, 1, flags); + return xfs_trans_get_buf_map(tp, target, &map, 1, flags, bpp); } int xfs_trans_read_buf_map(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index b5b3a78ef31c..08174ffa2118 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -112,19 +112,22 @@ xfs_trans_bjoin( * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ -struct xfs_buf * +int xfs_trans_get_buf_map( struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { xfs_buf_t *bp; struct xfs_buf_log_item *bip; + int error; + *bpp = NULL; if (!tp) - return xfs_buf_get_map(target, map, nmaps, flags); + return xfs_buf_get_map(target, map, nmaps, flags, bpp); /* * If we find the buffer in the cache with this transaction @@ -146,19 +149,20 @@ xfs_trans_get_buf_map( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; trace_xfs_trans_get_buf_recur(bip); - return bp; + *bpp = bp; + return 0; } - bp = xfs_buf_get_map(target, map, nmaps, flags); - if (bp == NULL) { - return NULL; - } + error = xfs_buf_get_map(target, map, nmaps, flags, &bp); + if (error) + return error; ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_get_buf(bp->b_log_item); - return bp; + *bpp = bp; + return 0; } /* @@ -276,7 +280,7 @@ xfs_trans_read_buf_map( ASSERT(bp->b_ops != NULL); error = xfs_buf_reverify(bp, ops); if (error) { - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __return_address); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, @@ -298,36 +302,17 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags, ops); - if (!bp) { - if (!(flags & XBF_TRYLOCK)) - return -ENOMEM; - return tp ? 0 : -EAGAIN; - } - - /* - * If we've had a read error, then the contents of the buffer are - * invalid and should not be used. To ensure that a followup read tries - * to pull the buffer from disk again, we clear the XBF_DONE flag and - * mark the buffer stale. This ensures that anyone who has a current - * reference to the buffer will interpret it's contents correctly and - * future cache lookups will also treat it as an empty, uninitialised - * buffer. - */ - if (bp->b_error) { - error = bp->b_error; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_buf_ioerror_alert(bp, __func__); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - + error = xfs_buf_read_map(target, map, nmaps, flags, &bp, ops, + __return_address); + switch (error) { + case 0: + break; + default: if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); - - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; + /* fall through */ + case -ENOMEM: + case -EAGAIN: return error; } |