Diffstat (limited to 'fs')
-rw-r--r--  fs/afs/cmservice.c | 6
-rw-r--r--  fs/afs/dir_edit.c | 12
-rw-r--r--  fs/afs/file.c | 6
-rw-r--r--  fs/afs/fsclient.c | 16
-rw-r--r--  fs/afs/internal.h | 16
-rw-r--r--  fs/afs/rxrpc.c | 12
-rw-r--r--  fs/afs/server.c | 3
-rw-r--r--  fs/afs/vl_list.c | 4
-rw-r--r--  fs/afs/vlclient.c | 6
-rw-r--r--  fs/afs/xattr.c | 16
-rw-r--r--  fs/afs/yfsclient.c | 11
-rw-r--r--  fs/aio.c | 2
-rw-r--r--  fs/binfmt_elf.c | 12
-rw-r--r--  fs/binfmt_elf_fdpic.c | 12
-rw-r--r--  fs/btrfs/super.c | 2
-rw-r--r--  fs/buffer.c | 54
-rw-r--r--  fs/ceph/dir.c | 1
-rw-r--r--  fs/ceph/file.c | 2
-rw-r--r--  fs/cifs/cifs_debug.c | 43
-rw-r--r--  fs/cifs/cifs_spnego.c | 2
-rw-r--r--  fs/cifs/cifsacl.c | 2
-rw-r--r--  fs/cifs/cifsfs.c | 44
-rw-r--r--  fs/cifs/cifsfs.h | 3
-rw-r--r--  fs/cifs/cifsglob.h | 90
-rw-r--r--  fs/cifs/cifsproto.h | 8
-rw-r--r--  fs/cifs/connect.c | 191
-rw-r--r--  fs/cifs/dfs_cache.c | 3
-rw-r--r--  fs/cifs/dir.c | 6
-rw-r--r--  fs/cifs/file.c | 159
-rw-r--r--  fs/cifs/inode.c | 333
-rw-r--r--  fs/cifs/misc.c | 17
-rw-r--r--  fs/cifs/sess.c | 230
-rw-r--r--  fs/cifs/smb1ops.c | 8
-rw-r--r--  fs/cifs/smb2misc.c | 175
-rw-r--r--  fs/cifs/smb2ops.c | 141
-rw-r--r--  fs/cifs/smb2pdu.c | 168
-rw-r--r--  fs/cifs/smb2pdu.h | 2
-rw-r--r--  fs/cifs/smb2proto.h | 6
-rw-r--r--  fs/cifs/smb2transport.c | 165
-rw-r--r--  fs/cifs/smbdirect.c | 36
-rw-r--r--  fs/cifs/transport.c | 37
-rw-r--r--  fs/compat_binfmt_elf.c | 4
-rw-r--r--  fs/compat_ioctl.c | 917
-rw-r--r--  fs/dax.c | 13
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/debugfs/file.c | 87
-rw-r--r--  fs/direct-io.c | 21
-rw-r--r--  fs/ecryptfs/file.c | 1
-rw-r--r--  fs/erofs/Kconfig | 1
-rw-r--r--  fs/erofs/decompressor.c | 2
-rw-r--r--  fs/erofs/erofs_fs.h | 3
-rw-r--r--  fs/erofs/internal.h | 7
-rw-r--r--  fs/erofs/super.c | 39
-rw-r--r--  fs/erofs/utils.c | 17
-rw-r--r--  fs/erofs/zdata.c | 288
-rw-r--r--  fs/erofs/zdata.h | 8
-rw-r--r--  fs/erofs/zmap.c | 28
-rw-r--r--  fs/exec.c | 3
-rw-r--r--  fs/ext2/balloc.c | 75
-rw-r--r--  fs/ext2/ext2.h | 12
-rw-r--r--  fs/ext2/inode.c | 9
-rw-r--r--  fs/ext2/ioctl.c | 5
-rw-r--r--  fs/ext2/super.c | 13
-rw-r--r--  fs/ext4/ext4.h | 22
-rw-r--r--  fs/ext4/ext4_jbd2.c | 32
-rw-r--r--  fs/ext4/ext4_jbd2.h | 106
-rw-r--r--  fs/ext4/extents.c | 149
-rw-r--r--  fs/ext4/file.c | 412
-rw-r--r--  fs/ext4/fsync.c | 72
-rw-r--r--  fs/ext4/ialloc.c | 7
-rw-r--r--  fs/ext4/indirect.c | 125
-rw-r--r--  fs/ext4/inode.c | 926
-rw-r--r--  fs/ext4/ioctl.c | 1
-rw-r--r--  fs/ext4/migrate.c | 103
-rw-r--r--  fs/ext4/namei.c | 50
-rw-r--r--  fs/ext4/page-io.c | 167
-rw-r--r--  fs/ext4/readpage.c | 6
-rw-r--r--  fs/ext4/resize.c | 46
-rw-r--r--  fs/ext4/super.c | 59
-rw-r--r--  fs/ext4/xattr.c | 94
-rw-r--r--  fs/f2fs/checkpoint.c | 2
-rw-r--r--  fs/f2fs/data.c | 190
-rw-r--r--  fs/f2fs/dir.c | 7
-rw-r--r--  fs/f2fs/f2fs.h | 63
-rw-r--r--  fs/f2fs/file.c | 48
-rw-r--r--  fs/f2fs/gc.c | 46
-rw-r--r--  fs/f2fs/inode.c | 8
-rw-r--r--  fs/f2fs/namei.c | 15
-rw-r--r--  fs/f2fs/node.c | 3
-rw-r--r--  fs/f2fs/recovery.c | 2
-rw-r--r--  fs/f2fs/segment.c | 64
-rw-r--r--  fs/f2fs/segment.h | 2
-rw-r--r--  fs/f2fs/super.c | 52
-rw-r--r--  fs/f2fs/sysfs.c | 4
-rw-r--r--  fs/f2fs/xattr.c | 14
-rw-r--r--  fs/fat/file.c | 13
-rw-r--r--  fs/file.c | 2
-rw-r--r--  fs/fuse/dev.c | 33
-rw-r--r--  fs/gfs2/bmap.c | 3
-rw-r--r--  fs/gfs2/file.c | 36
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/file.c | 1
-rw-r--r--  fs/hugetlbfs/inode.c | 63
-rw-r--r--  fs/io-wq.c | 187
-rw-r--r--  fs/io-wq.h | 63
-rw-r--r--  fs/io_uring.c | 834
-rw-r--r--  fs/ioctl.c | 80
-rw-r--r--  fs/iomap/Makefile | 16
-rw-r--r--  fs/iomap/apply.c | 32
-rw-r--r--  fs/iomap/buffered-io.c | 756
-rw-r--r--  fs/iomap/direct-io.c | 24
-rw-r--r--  fs/iomap/fiemap.c | 10
-rw-r--r--  fs/iomap/seek.c | 4
-rw-r--r--  fs/iomap/swapfile.c | 3
-rw-r--r--  fs/iomap/trace.c | 12
-rw-r--r--  fs/iomap/trace.h | 191
-rw-r--r--  fs/jbd2/checkpoint.c | 2
-rw-r--r--  fs/jbd2/commit.c | 26
-rw-r--r--  fs/jbd2/journal.c | 65
-rw-r--r--  fs/jbd2/revoke.c | 6
-rw-r--r--  fs/jbd2/transaction.c | 400
-rw-r--r--  fs/kernfs/dir.c | 4
-rw-r--r--  fs/namei.c | 8
-rw-r--r--  fs/nilfs2/ioctl.c | 1
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 2
-rw-r--r--  fs/notify/fdinfo.c | 2
-rw-r--r--  fs/notify/fsnotify.c | 2
-rw-r--r--  fs/notify/fsnotify.h | 2
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/ocfs2/alloc.c | 32
-rw-r--r--  fs/ocfs2/aops.c | 1
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/ioctl.c | 1
-rw-r--r--  fs/ocfs2/journal.c | 8
-rw-r--r--  fs/ocfs2/quota_global.c | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 19
-rw-r--r--  fs/ocfs2/super.c | 4
-rw-r--r--  fs/open.c | 6
-rw-r--r--  fs/pipe.c | 232
-rw-r--r--  fs/proc/stat.c | 56
-rw-r--r--  fs/quota/dquot.c | 289
-rw-r--r--  fs/quota/quota.c | 7
-rw-r--r--  fs/quota/quota_v1.c | 1
-rw-r--r--  fs/reiserfs/file.c | 10
-rw-r--r--  fs/reiserfs/inode.c | 12
-rw-r--r--  fs/reiserfs/namei.c | 7
-rw-r--r--  fs/reiserfs/reiserfs.h | 2
-rw-r--r--  fs/reiserfs/super.c | 2
-rw-r--r--  fs/reiserfs/xattr.c | 19
-rw-r--r--  fs/reiserfs/xattr_acl.c | 4
-rw-r--r--  fs/select.c | 10
-rw-r--r--  fs/splice.c | 199
-rw-r--r--  fs/timerfd.c | 14
-rw-r--r--  fs/userfaultfd.c | 23
-rw-r--r--  fs/utimes.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 14
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 3
-rw-r--r--  fs/xfs/xfs_aops.c | 754
-rw-r--r--  fs/xfs/xfs_aops.h | 17
-rw-r--r--  fs/xfs/xfs_file.c | 13
-rw-r--r--  fs/xfs/xfs_iomap.c | 51
-rw-r--r--  fs/xfs/xfs_iomap.h | 2
-rw-r--r--  fs/xfs/xfs_pnfs.c | 2
-rw-r--r--  fs/xfs/xfs_reflink.c | 2
-rw-r--r--  fs/xfs/xfs_super.c | 11
-rw-r--r--  fs/xfs/xfs_trace.h | 65
166 files changed, 6195 insertions(+), 5111 deletions(-)
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index b86195e4dc6c..ff3994a6be23 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -342,14 +342,14 @@ static int afs_deliver_cb_callback(struct afs_call *call)
if (call->count2 != call->count && call->count2 != 0)
return afs_protocol_error(call, -EBADMSG,
afs_eproto_cb_count);
- call->_iter = &call->iter;
- iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4);
+ call->iter = &call->def_iter;
+ iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4);
call->unmarshall++;
/* Fall through */
case 4:
_debug("extract discard %zu/%u",
- iov_iter_count(&call->iter), call->count2 * 3 * 4);
+ iov_iter_count(call->iter), call->count2 * 3 * 4);
ret = afs_extract_data(call, false);
if (ret < 0)
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index d4fbe5f85f1b..b108528bf010 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -68,13 +68,11 @@ static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_
static void afs_set_contig_bits(union afs_xdr_dir_block *block,
int bit, unsigned int nr_slots)
{
- u64 mask, before, after;
+ u64 mask;
mask = (1 << nr_slots) - 1;
mask <<= bit;
- before = *(u64 *)block->hdr.bitmap;
-
block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8);
block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8);
block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8);
@@ -83,8 +81,6 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block,
block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8);
block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8);
block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8);
-
- after = *(u64 *)block->hdr.bitmap;
}
/*
@@ -93,13 +89,11 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block,
static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
int bit, unsigned int nr_slots)
{
- u64 mask, before, after;
+ u64 mask;
mask = (1 << nr_slots) - 1;
mask <<= bit;
- before = *(u64 *)block->hdr.bitmap;
-
block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8);
block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8);
block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8);
@@ -108,8 +102,6 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8);
block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8);
block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8);
-
- after = *(u64 *)block->hdr.bitmap;
}
/*
diff --git a/fs/afs/file.c b/fs/afs/file.c
index dd3c55c9101c..8415733f7bc1 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -223,7 +223,7 @@ static void afs_file_readpage_read_complete(struct page *page,
/*
* Fetch file data from the volume.
*/
-int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *desc)
+int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req)
{
struct afs_fs_cursor fc;
struct afs_status_cb *scb;
@@ -246,7 +246,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(vnode);
- afs_fs_fetch_data(&fc, scb, desc);
+ afs_fs_fetch_data(&fc, scb, req);
}
afs_check_for_remote_deletion(&fc, vnode);
@@ -257,7 +257,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
if (ret == 0) {
afs_stat_v(vnode, n_fetches);
- atomic_long_add(desc->actual_len,
+ atomic_long_add(req->actual_len,
&afs_v2net(vnode)->n_fetch_bytes);
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 6f84231f11a5..1f9c5d8e6fe5 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -323,7 +323,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
int ret;
_enter("{%u,%zu/%llu}",
- call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
+ call->unmarshall, iov_iter_count(call->iter), req->actual_len);
switch (call->unmarshall) {
case 0:
@@ -363,14 +363,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
call->bvec[0].bv_len = size;
call->bvec[0].bv_offset = req->offset;
call->bvec[0].bv_page = req->pages[req->index];
- iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+ iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
/* Fall through */
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
- iov_iter_count(&call->iter), req->remain);
+ iov_iter_count(call->iter), req->remain);
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -398,7 +398,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
case 3:
_debug("extract discard %zu/%llu",
- iov_iter_count(&call->iter), req->actual_len - req->len);
+ iov_iter_count(call->iter), req->actual_len - req->len);
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -490,7 +490,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc,
call->key = fc->key;
call->out_scb = scb;
call->out_volsync = NULL;
- call->read_request = req;
+ call->read_request = afs_get_read(req);
/* marshall the parameters */
bp = call->request;
@@ -503,7 +503,6 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc,
bp[6] = 0;
bp[7] = htonl(lower_32_bits(req->len));
- refcount_inc(&req->usage);
afs_use_fs_server(call, fc->cbi);
trace_afs_make_fs_call(call, &vnode->fid);
afs_set_fc_call(call, fc);
@@ -540,7 +539,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc,
call->key = fc->key;
call->out_scb = scb;
call->out_volsync = NULL;
- call->read_request = req;
+ call->read_request = afs_get_read(req);
/* marshall the parameters */
bp = call->request;
@@ -551,7 +550,6 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc,
bp[4] = htonl(lower_32_bits(req->pos));
bp[5] = htonl(lower_32_bits(req->len));
- refcount_inc(&req->usage);
afs_use_fs_server(call, fc->cbi);
trace_afs_make_fs_call(call, &vnode->fid);
afs_set_fc_call(call, fc);
@@ -1852,7 +1850,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
u32 count;
int ret;
- _enter("{%u,%zu}", call->unmarshall, iov_iter_count(&call->iter));
+ _enter("{%u,%zu}", call->unmarshall, iov_iter_count(call->iter));
switch (call->unmarshall) {
case 0:
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 759e0578012c..1d81fc4c3058 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -115,9 +115,9 @@ struct afs_call {
struct afs_vnode *lvnode; /* vnode being locked */
void *request; /* request data (first part) */
struct address_space *mapping; /* Pages being written from */
- struct iov_iter iter; /* Buffer iterator */
- struct iov_iter *_iter; /* Iterator currently in use */
- union { /* Convenience for ->iter */
+ struct iov_iter def_iter; /* Default buffer/data iterator */
+ struct iov_iter *iter; /* Iterator currently in use */
+ union { /* Convenience for ->def_iter */
struct kvec kvec[1];
struct bio_vec bvec[1];
};
@@ -934,6 +934,12 @@ extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *);
extern int afs_page_filler(void *, struct page *);
extern void afs_put_read(struct afs_read *);
+static inline struct afs_read *afs_get_read(struct afs_read *req)
+{
+ refcount_inc(&req->usage);
+ return req;
+}
+
/*
* flock.c
*/
@@ -1136,7 +1142,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si
{
call->kvec[0].iov_base = buf;
call->kvec[0].iov_len = size;
- iov_iter_kvec(&call->iter, READ, call->kvec, 1, size);
+ iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size);
}
static inline void afs_extract_to_tmp(struct afs_call *call)
@@ -1151,7 +1157,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call)
static inline void afs_extract_discard(struct afs_call *call, size_t size)
{
- iov_iter_discard(&call->iter, READ, size);
+ iov_iter_discard(&call->def_iter, READ, size);
}
static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
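
The fsclient.c and yfsclient.c hunks in this series replace open-coded refcount_inc(&req->usage) calls with the afs_get_read() helper added above, so the reference is taken at the exact point the request is attached to the call. A minimal sketch of the intended get/put pairing, assuming the existing afs_put_read() release helper (the destructor placement below is illustrative, not copied from this patch):

	/* take a ref while the afs_call owns the read request ... */
	call->read_request = afs_get_read(req);

	/* ... and drop it when the call is torn down (hypothetical site) */
	afs_put_read(call->read_request);
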
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 61498d9f06ef..58d396592250 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -152,7 +152,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
INIT_WORK(&call->async_work, afs_process_async_call);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
- call->_iter = &call->iter;
+ call->iter = &call->def_iter;
o = atomic_inc_return(&net->nr_outstanding_calls);
trace_afs_call(call, afs_call_trace_alloc, 1, o,
@@ -513,12 +513,12 @@ static void afs_deliver_to_call(struct afs_call *call)
state == AFS_CALL_SV_AWAIT_ACK
) {
if (state == AFS_CALL_SV_AWAIT_ACK) {
- iov_iter_kvec(&call->iter, READ, NULL, 0, 0);
+ iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0);
ret = rxrpc_kernel_recv_data(call->net->socket,
- call->rxcall, &call->iter,
+ call->rxcall, &call->def_iter,
false, &remote_abort,
&call->service_id);
- trace_afs_receive_data(call, &call->iter, false, ret);
+ trace_afs_receive_data(call, &call->def_iter, false, ret);
if (ret == -EINPROGRESS || ret == -EAGAIN)
return;
@@ -859,7 +859,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
{
int ret;
- _enter("{%zu}", iov_iter_count(call->_iter));
+ _enter("{%zu}", iov_iter_count(call->iter));
/* the operation ID forms the first four bytes of the request data */
ret = afs_extract_data(call, true);
@@ -975,7 +975,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
int afs_extract_data(struct afs_call *call, bool want_more)
{
struct afs_net *net = call->net;
- struct iov_iter *iter = call->_iter;
+ struct iov_iter *iter = call->iter;
enum afs_call_state state;
u32 remote_abort = 0;
int ret;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 64d440aaabc0..1686bf188ccd 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -151,7 +151,7 @@ static struct afs_server *afs_install_server(struct afs_net *net,
const struct afs_addr_list *alist;
struct afs_server *server;
struct rb_node **pp, *p;
- int ret = -EEXIST, diff;
+ int diff;
_enter("%p", candidate);
@@ -196,7 +196,6 @@ static struct afs_server *afs_install_server(struct afs_net *net,
hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6);
write_sequnlock(&net->fs_addr_lock);
- ret = 0;
exists:
afs_get_server(server, afs_server_trace_get_install);
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 21eb0c0be912..8fea54eba0c2 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -279,8 +279,8 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
struct afs_addr_list *old = addrs;
write_lock(&server->lock);
- rcu_swap_protected(server->addresses, old,
- lockdep_is_held(&server->lock));
+ old = rcu_replace_pointer(server->addresses, old,
+ lockdep_is_held(&server->lock));
write_unlock(&server->lock);
afs_put_addrlist(old);
}
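
rcu_swap_protected() updated its pointer argument in place, while the rcu_replace_pointer() form above publishes the new pointer and returns the old one, which keeps the ownership transfer explicit at the call site. Roughly its shape, paraphrased from linux/rcupdate.h (check the tree for the exact definition):

	#define rcu_replace_pointer(rcu_ptr, ptr, c)				\
	({									\
		typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c));	\
		rcu_assign_pointer((rcu_ptr), (ptr));				\
		__tmp;								\
	})
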
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index cfb0ac4bd039..516e9a3bb5b4 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -185,7 +185,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
int i, ret;
_enter("{%u,%zu/%u}",
- call->unmarshall, iov_iter_count(call->_iter), call->count);
+ call->unmarshall, iov_iter_count(call->iter), call->count);
switch (call->unmarshall) {
case 0:
@@ -316,7 +316,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
int ret;
_enter("{%u,%zu/%u}",
- call->unmarshall, iov_iter_count(call->_iter), call->count);
+ call->unmarshall, iov_iter_count(call->iter), call->count);
switch (call->unmarshall) {
case 0:
@@ -425,7 +425,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
int ret;
_enter("{%u,%zu,%u}",
- call->unmarshall, iov_iter_count(call->_iter), call->count2);
+ call->unmarshall, iov_iter_count(call->iter), call->count2);
switch (call->unmarshall) {
case 0:
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 5552d034090a..7af41fd5f3ee 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -228,11 +228,11 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler,
break;
case 1:
data = buf;
- dsize = snprintf(buf, sizeof(buf), "%u", yacl->inherit_flag);
+ dsize = scnprintf(buf, sizeof(buf), "%u", yacl->inherit_flag);
break;
case 2:
data = buf;
- dsize = snprintf(buf, sizeof(buf), "%u", yacl->num_cleaned);
+ dsize = scnprintf(buf, sizeof(buf), "%u", yacl->num_cleaned);
break;
case 3:
data = yacl->vol_acl->data;
@@ -370,13 +370,15 @@ static int afs_xattr_get_fid(const struct xattr_handler *handler,
/* The volume ID is 64-bit, the vnode ID is 96-bit and the
* uniquifier is 32-bit.
*/
- len = sprintf(text, "%llx:", vnode->fid.vid);
+ len = scnprintf(text, sizeof(text), "%llx:", vnode->fid.vid);
if (vnode->fid.vnode_hi)
- len += sprintf(text + len, "%x%016llx",
- vnode->fid.vnode_hi, vnode->fid.vnode);
+ len += scnprintf(text + len, sizeof(text) - len, "%x%016llx",
+ vnode->fid.vnode_hi, vnode->fid.vnode);
else
- len += sprintf(text + len, "%llx", vnode->fid.vnode);
- len += sprintf(text + len, ":%x", vnode->fid.unique);
+ len += scnprintf(text + len, sizeof(text) - len, "%llx",
+ vnode->fid.vnode);
+ len += scnprintf(text + len, sizeof(text) - len, ":%x",
+ vnode->fid.unique);
if (size == 0)
return len;
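
The snprintf()-to-scnprintf() conversions above are not cosmetic: snprintf() returns the length the output would have had without truncation, whereas scnprintf() returns the number of bytes actually written (excluding the trailing NUL), so chained "len += ..." accumulation stays within the buffer. A small illustration (buffer size and contents are made up):

	char buf[8];
	int a, b;

	a = snprintf(buf, sizeof(buf), "%s", "hello, world");	/* a == 12 */
	b = scnprintf(buf, sizeof(buf), "%s", "hello, world");	/* b == 7  */
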
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 9ac035c17dc4..a26126ac7bf1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -441,7 +441,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
int ret;
_enter("{%u,%zu/%llu}",
- call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
+ call->unmarshall, iov_iter_count(call->iter), req->actual_len);
switch (call->unmarshall) {
case 0:
@@ -476,14 +476,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
call->bvec[0].bv_len = size;
call->bvec[0].bv_offset = req->offset;
call->bvec[0].bv_page = req->pages[req->index];
- iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+ iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
/* Fall through */
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
- iov_iter_count(&call->iter), req->remain);
+ iov_iter_count(call->iter), req->remain);
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -511,7 +511,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
case 3:
_debug("extract discard %zu/%llu",
- iov_iter_count(&call->iter), req->actual_len - req->len);
+ iov_iter_count(call->iter), req->actual_len - req->len);
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -605,7 +605,7 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
call->key = fc->key;
call->out_scb = scb;
call->out_volsync = NULL;
- call->read_request = req;
+ call->read_request = afs_get_read(req);
/* marshall the parameters */
bp = call->request;
@@ -616,7 +616,6 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
bp = xdr_encode_u64(bp, req->len);
yfs_check_req(call, bp);
- refcount_inc(&req->usage);
afs_use_fs_server(call, fc->cbi);
trace_afs_make_fs_call(call, &vnode->fid);
afs_set_fc_call(call, fc);
diff --git a/fs/aio.c b/fs/aio.c
index 0d9a559d488c..a9fbad2ce5e6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2056,7 +2056,7 @@ static long do_io_getevents(aio_context_t ctx_id,
* specifies an infinite timeout. Note that the timeout pointed to by
* timeout is relative. Will fail with -ENOSYS if not implemented.
*/
-#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+#ifdef CONFIG_64BIT
SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
long, min_nr,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c5642bcb6b46..5372eabd276a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1489,18 +1489,18 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
* group-wide total, not its individual thread total.
*/
thread_group_cputime(p, &cputime);
- prstatus->pr_utime = ns_to_timeval(cputime.utime);
- prstatus->pr_stime = ns_to_timeval(cputime.stime);
+ prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime);
+ prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime);
} else {
u64 utime, stime;
task_cputime(p, &utime, &stime);
- prstatus->pr_utime = ns_to_timeval(utime);
- prstatus->pr_stime = ns_to_timeval(stime);
+ prstatus->pr_utime = ns_to_kernel_old_timeval(utime);
+ prstatus->pr_stime = ns_to_kernel_old_timeval(stime);
}
- prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
- prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
+ prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime);
+ prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime);
}
static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d86ebd0dcc3d..240f66663543 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1359,17 +1359,17 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
* group-wide total, not its individual thread total.
*/
thread_group_cputime(p, &cputime);
- prstatus->pr_utime = ns_to_timeval(cputime.utime);
- prstatus->pr_stime = ns_to_timeval(cputime.stime);
+ prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime);
+ prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime);
} else {
u64 utime, stime;
task_cputime(p, &utime, &stime);
- prstatus->pr_utime = ns_to_timeval(utime);
- prstatus->pr_stime = ns_to_timeval(stime);
+ prstatus->pr_utime = ns_to_kernel_old_timeval(utime);
+ prstatus->pr_stime = ns_to_kernel_old_timeval(stime);
}
- prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
- prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
+ prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime);
+ prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime);
prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a98c3c71fc54..f452a94abdc3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2299,7 +2299,7 @@ static const struct super_operations btrfs_super_ops = {
static const struct file_operations btrfs_ctl_fops = {
.open = btrfs_control_open,
.unlocked_ioctl = btrfs_control_ioctl,
- .compat_ioctl = btrfs_control_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.owner = THIS_MODULE,
.llseek = noop_llseek,
};
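
This conversion (and the ceph ones below) routes 32-bit ioctls through the generic compat_ptr_ioctl() helper, which only performs the compat_ptr() conversion on the argument before forwarding to ->unlocked_ioctl. Roughly its shape (simplified; the real helper lives in fs/ioctl.c under CONFIG_COMPAT):

	long compat_ptr_ioctl(struct file *file, unsigned int cmd,
			      unsigned long arg)
	{
		if (!file->f_op->unlocked_ioctl)
			return -ENOIOCTLCMD;

		return file->f_op->unlocked_ioctl(file, cmd,
				(unsigned long)compat_ptr(arg));
	}

Pointing .compat_ioctl straight at the native handler happened to work on most architectures, but the compat_ptr() conversion is what makes the pointer handling correct on s390 as well.
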
diff --git a/fs/buffer.c b/fs/buffer.c
index 86a38b979323..d8c7242426bb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -47,6 +47,9 @@
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
+#include <linux/fscrypt.h>
+
+#include "internal.h"
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
@@ -246,10 +249,6 @@ out:
return ret;
}
-/*
- * I/O completion handler for block_read_full_page() - pages
- * which come unlocked at the end of I/O.
- */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
@@ -307,6 +306,47 @@ still_busy:
return;
}
+struct decrypt_bh_ctx {
+ struct work_struct work;
+ struct buffer_head *bh;
+};
+
+static void decrypt_bh(struct work_struct *work)
+{
+ struct decrypt_bh_ctx *ctx =
+ container_of(work, struct decrypt_bh_ctx, work);
+ struct buffer_head *bh = ctx->bh;
+ int err;
+
+ err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
+ bh_offset(bh));
+ end_buffer_async_read(bh, err == 0);
+ kfree(ctx);
+}
+
+/*
+ * I/O completion handler for block_read_full_page() - pages
+ * which come unlocked at the end of I/O.
+ */
+static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
+{
+ /* Decrypt if needed */
+ if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) &&
+ IS_ENCRYPTED(bh->b_page->mapping->host) &&
+ S_ISREG(bh->b_page->mapping->host->i_mode)) {
+ struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
+
+ if (ctx) {
+ INIT_WORK(&ctx->work, decrypt_bh);
+ ctx->bh = bh;
+ fscrypt_enqueue_decrypt_work(&ctx->work);
+ return;
+ }
+ uptodate = 0;
+ }
+ end_buffer_async_read(bh, uptodate);
+}
+
/*
* Completion handler for block_write_full_page() - pages which are unlocked
* during I/O, and which have PageWriteback cleared upon I/O completion.
@@ -379,7 +419,7 @@ EXPORT_SYMBOL(end_buffer_async_write);
*/
static void mark_buffer_async_read(struct buffer_head *bh)
{
- bh->b_end_io = end_buffer_async_read;
+ bh->b_end_io = end_buffer_async_read_io;
set_buffer_async_read(bh);
}
@@ -1385,10 +1425,10 @@ static bool has_bh_in_lru(int cpu, void *dummy)
for (i = 0; i < BH_LRU_SIZE; i++) {
if (b->bhs[i])
- return 1;
+ return true;
}
- return 0;
+ return false;
}
void invalidate_bh_lrus(void)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d17a789fd856..2e4764fd1872 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1809,6 +1809,7 @@ const struct file_operations ceph_dir_fops = {
.open = ceph_open,
.release = ceph_release,
.unlocked_ioctl = ceph_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.fsync = ceph_fsync,
.lock = ceph_lock,
.flock = ceph_flock,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8de633964dc3..11929d2bb594 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2188,7 +2188,7 @@ const struct file_operations ceph_file_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
- .compat_ioctl = ceph_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.fallocate = ceph_fallocate,
.copy_file_range = ceph_copy_file_range,
};
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 0b4eee3bed66..19f6e592b941 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -122,6 +122,27 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
}
static void
+cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan)
+{
+ struct TCP_Server_Info *server = chan->server;
+
+ seq_printf(m, "\t\tChannel %d Number of credits: %d Dialect 0x%x "
+ "TCP status: %d Instance: %d Local Users To Server: %d "
+ "SecMode: 0x%x Req On Wire: %d In Send: %d "
+ "In MaxReq Wait: %d\n",
+ i+1,
+ server->credits,
+ server->dialect,
+ server->tcpStatus,
+ server->reconnect_instance,
+ server->srv_count,
+ server->sec_mode,
+ in_flight(server),
+ atomic_read(&server->in_send),
+ atomic_read(&server->num_waiters));
+}
+
+static void
cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
{
struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr;
@@ -256,6 +277,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
if (!server->rdma)
goto skip_rdma;
+ if (!server->smbd_conn) {
+ seq_printf(m, "\nSMBDirect transport not available");
+ goto skip_rdma;
+ }
+
seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
"transport status: %x",
server->smbd_conn->protocol,
@@ -360,11 +386,10 @@ skip_rdma:
server->srv_count,
server->sec_mode, in_flight(server));
-#ifdef CONFIG_CIFS_STATS2
seq_printf(m, " In Send: %d In MaxReq Wait: %d",
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
-#endif
+
/* dump session id helpful for use with network trace */
seq_printf(m, " SessionId: 0x%llx", ses->Suid);
if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
@@ -372,6 +397,13 @@ skip_rdma:
if (ses->sign)
seq_puts(m, " signed");
+ if (ses->chan_count > 1) {
+ seq_printf(m, "\n\n\tExtra Channels: %zu\n",
+ ses->chan_count-1);
+ for (j = 1; j < ses->chan_count; j++)
+ cifs_dump_channel(m, j, &ses->chans[j]);
+ }
+
seq_puts(m, "\n\tShares:");
j = 0;
@@ -410,8 +442,13 @@ skip_rdma:
seq_printf(m, "\n\tServer interfaces: %zu\n",
ses->iface_count);
for (j = 0; j < ses->iface_count; j++) {
+ struct cifs_server_iface *iface;
+
+ iface = &ses->iface_list[j];
seq_printf(m, "\t%d)", j);
- cifs_dump_iface(m, &ses->iface_list[j]);
+ cifs_dump_iface(m, iface);
+ if (is_ses_using_iface(ses, iface))
+ seq_puts(m, "\t\t[CONNECTED]\n");
}
spin_unlock(&ses->iface_lock);
}
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 7f01c6e60791..7b9b876b513b 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,7 +98,7 @@ struct key_type cifs_spnego_key_type = {
struct key *
cifs_get_spnego_key(struct cifs_ses *sesInfo)
{
- struct TCP_Server_Info *server = sesInfo->server;
+ struct TCP_Server_Info *server = cifs_ses_server(sesInfo);
struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
char *description, *dp;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index f842944a5c76..06ffe52bdcfa 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -39,8 +39,6 @@ static const struct cifs_sid sid_everyone = {
/* security id for Authenticated Users system group */
static const struct cifs_sid sid_authusers = {
1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
-/* group users */
-static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
/* S-1-22-1 Unmapped Unix users */
static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22},
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 1a135d1b85bd..5d3e63aff253 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp;
struct workqueue_struct *cifsiod_wq;
struct workqueue_struct *decrypt_wq;
+struct workqueue_struct *fileinfo_put_wq;
struct workqueue_struct *cifsoplockd_wq;
__u32 cifs_lock_secret;
@@ -613,6 +614,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
/* convert actimeo and display it in seconds */
seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
+ if (tcon->ses->chan_max > 1)
+ seq_printf(s, ",multichannel,max_channel=%zu",
+ tcon->ses->chan_max);
+
return 0;
}
@@ -1219,6 +1224,7 @@ const struct file_operations cifs_file_ops = {
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
+ .flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
@@ -1238,6 +1244,7 @@ const struct file_operations cifs_file_strict_ops = {
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
+ .flock = cifs_flock,
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
.mmap = cifs_file_strict_mmap,
@@ -1257,6 +1264,7 @@ const struct file_operations cifs_file_direct_ops = {
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
+ .flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
@@ -1554,11 +1562,18 @@ init_cifs(void)
goto out_destroy_cifsiod_wq;
}
+ fileinfo_put_wq = alloc_workqueue("cifsfileinfoput",
+ WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!fileinfo_put_wq) {
+ rc = -ENOMEM;
+ goto out_destroy_decrypt_wq;
+ }
+
cifsoplockd_wq = alloc_workqueue("cifsoplockd",
WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
if (!cifsoplockd_wq) {
rc = -ENOMEM;
- goto out_destroy_decrypt_wq;
+ goto out_destroy_fileinfo_put_wq;
}
rc = cifs_fscache_register();
@@ -1624,6 +1639,8 @@ out_unreg_fscache:
cifs_fscache_unregister();
out_destroy_cifsoplockd_wq:
destroy_workqueue(cifsoplockd_wq);
+out_destroy_fileinfo_put_wq:
+ destroy_workqueue(fileinfo_put_wq);
out_destroy_decrypt_wq:
destroy_workqueue(decrypt_wq);
out_destroy_cifsiod_wq:
@@ -1653,6 +1670,7 @@ exit_cifs(void)
cifs_fscache_unregister();
destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(decrypt_wq);
+ destroy_workqueue(fileinfo_put_wq);
destroy_workqueue(cifsiod_wq);
cifs_proc_clean();
}
@@ -1663,17 +1681,17 @@ MODULE_DESCRIPTION
("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and "
"also older servers complying with the SNIA CIFS Specification)");
MODULE_VERSION(CIFS_VERSION);
-MODULE_SOFTDEP("pre: ecb");
-MODULE_SOFTDEP("pre: hmac");
-MODULE_SOFTDEP("pre: md4");
-MODULE_SOFTDEP("pre: md5");
-MODULE_SOFTDEP("pre: nls");
-MODULE_SOFTDEP("pre: aes");
-MODULE_SOFTDEP("pre: cmac");
-MODULE_SOFTDEP("pre: sha256");
-MODULE_SOFTDEP("pre: sha512");
-MODULE_SOFTDEP("pre: aead2");
-MODULE_SOFTDEP("pre: ccm");
-MODULE_SOFTDEP("pre: gcm");
+MODULE_SOFTDEP("ecb");
+MODULE_SOFTDEP("hmac");
+MODULE_SOFTDEP("md4");
+MODULE_SOFTDEP("md5");
+MODULE_SOFTDEP("nls");
+MODULE_SOFTDEP("aes");
+MODULE_SOFTDEP("cmac");
+MODULE_SOFTDEP("sha256");
+MODULE_SOFTDEP("sha512");
+MODULE_SOFTDEP("aead2");
+MODULE_SOFTDEP("ccm");
+MODULE_SOFTDEP("gcm");
module_init(init_cifs)
module_exit(exit_cifs)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index bc4ca94137f2..b59dc7478130 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -108,6 +108,7 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
+extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock);
extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
@@ -152,5 +153,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.23"
+#define CIFS_VERSION "2.24"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d78bfcc19156..d34a4ed8c57d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -230,7 +230,8 @@ struct smb_version_operations {
bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *);
/* setup request: allocate mid, sign message */
struct mid_q_entry *(*setup_request)(struct cifs_ses *,
- struct smb_rqst *);
+ struct TCP_Server_Info *,
+ struct smb_rqst *);
/* setup async request: allocate mid, sign message */
struct mid_q_entry *(*setup_async_request)(struct TCP_Server_Info *,
struct smb_rqst *);
@@ -268,8 +269,9 @@ struct smb_version_operations {
int (*check_message)(char *, unsigned int, struct TCP_Server_Info *);
bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
int (*handle_cancelled_mid)(char *, struct TCP_Server_Info *);
- void (*downgrade_oplock)(struct TCP_Server_Info *,
- struct cifsInodeInfo *, bool);
+ void (*downgrade_oplock)(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache);
/* process transaction2 response */
bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
char *, int);
@@ -591,6 +593,10 @@ struct smb_vol {
bool resilient:1; /* noresilient not required since not forced for CA */
bool domainauto:1;
bool rdma:1;
+ bool multichannel:1;
+ bool use_client_guid:1;
+ /* reuse existing guid for multichannel */
+ u8 client_guid[SMB2_CLIENT_GUID_SIZE];
unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
@@ -607,6 +613,7 @@ struct smb_vol {
__u64 snapshot_time; /* needed for timewarp tokens */
__u32 handle_timeout; /* persistent and durable handle timeout in ms */
unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
+ unsigned int max_channels;
__u16 compression; /* compression algorithm 0xFFFF default 0=disabled */
bool rootfs:1; /* if it's a SMB root file system */
};
@@ -736,12 +743,12 @@ struct TCP_Server_Info {
/* Total size of this PDU. Only valid from cifs_demultiplex_thread */
unsigned int pdu_size;
unsigned int total_read; /* total amount of data read in this pass */
+ atomic_t in_send; /* requests trying to send */
+ atomic_t num_waiters; /* blocked waiting to get in sendrecv */
#ifdef CONFIG_CIFS_FSCACHE
struct fscache_cookie *fscache; /* client index cache cookie */
#endif
#ifdef CONFIG_CIFS_STATS2
- atomic_t in_send; /* requests trying to send */
- atomic_t num_waiters; /* blocked waiting to get in sendrecv */
atomic_t num_cmds[NUMBER_OF_SMB2_COMMANDS]; /* total requests by cmd */
atomic_t smb2slowcmd[NUMBER_OF_SMB2_COMMANDS]; /* count resps > 1 sec */
__u64 time_per_cmd[NUMBER_OF_SMB2_COMMANDS]; /* total time per cmd */
@@ -953,6 +960,11 @@ struct cifs_server_iface {
struct sockaddr_storage sockaddr;
};
+struct cifs_chan {
+ struct TCP_Server_Info *server;
+ __u8 signkey[SMB3_SIGN_KEY_SIZE];
+};
+
/*
* Session structure. One of these for each uid session with a particular host
*/
@@ -983,12 +995,15 @@ struct cifs_ses {
bool sign; /* is signing required? */
bool need_reconnect:1; /* connection reset, uid now invalid */
bool domainAuto:1;
+ bool binding:1; /* are we binding the session? */
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
+ __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
+
/*
* Network interfaces available on the server this session is
* connected to.
@@ -1002,8 +1017,37 @@ struct cifs_ses {
struct cifs_server_iface *iface_list;
size_t iface_count;
unsigned long iface_last_update; /* jiffies */
+
+#define CIFS_MAX_CHANNELS 16
+ struct cifs_chan chans[CIFS_MAX_CHANNELS];
+ size_t chan_count;
+ size_t chan_max;
+ atomic_t chan_seq; /* round robin state */
};
+/*
+ * When binding a new channel, we need to access the channel which isn't fully
+ * established yet (one past the established count)
+ */
+
+static inline
+struct cifs_chan *cifs_ses_binding_channel(struct cifs_ses *ses)
+{
+ if (ses->binding)
+ return &ses->chans[ses->chan_count];
+ else
+ return NULL;
+}
+
+static inline
+struct TCP_Server_Info *cifs_ses_server(struct cifs_ses *ses)
+{
+ if (ses->binding)
+ return ses->chans[ses->chan_count].server;
+ else
+ return ses->server;
+}
+
static inline bool
cap_unix(struct cifs_ses *ses)
{
@@ -1260,11 +1304,14 @@ struct cifsFileInfo {
unsigned int f_flags;
bool invalidHandle:1; /* file closed via session abend */
bool oplock_break_cancelled:1;
+ unsigned int oplock_epoch; /* epoch from the lease break */
+ __u32 oplock_level; /* oplock/lease level from the lease break */
int count;
spinlock_t file_info_lock; /* protects four flag/count fields above */
struct mutex fh_mutex; /* prevents reopen race after dead ses*/
struct cifs_search_info srch_inf;
struct work_struct oplock_break; /* work for oplock breaks */
+ struct work_struct put; /* work for the final part of _put */
};
struct cifs_io_parms {
@@ -1370,7 +1417,8 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
}
struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
-void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr);
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr,
+ bool offload);
void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
#define CIFS_CACHE_READ_FLG 1
@@ -1405,7 +1453,7 @@ struct cifsInodeInfo {
unsigned int epoch; /* used to track lease state changes */
#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */
#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */
-#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */
+#define CIFS_INODE_FLAG_UNUSED (2) /* Unused flag */
#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */
#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */
#define CIFS_INO_LOCK (5) /* lock bit for synchronization */
@@ -1524,6 +1572,7 @@ struct mid_q_entry {
struct TCP_Server_Info *server; /* server corresponding to this mid */
__u64 mid; /* multiplex id */
__u16 credits; /* number of credits consumed by this mid */
+ __u16 credits_received; /* number of credits from the response */
__u32 pid; /* process id */
__u32 sequence_number; /* for CIFS signing */
unsigned long when_alloc; /* when mid was created */
@@ -1551,12 +1600,12 @@ struct close_cancelled_open {
struct cifs_fid fid;
struct cifs_tcon *tcon;
struct work_struct work;
+ __u64 mid;
+ __u16 cmd;
};
/* Make code in transport.c a little cleaner by moving
update of optional stats into function below */
-#ifdef CONFIG_CIFS_STATS2
-
static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
{
atomic_inc(&server->in_send);
@@ -1577,26 +1626,12 @@ static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
atomic_dec(&server->num_waiters);
}
+#ifdef CONFIG_CIFS_STATS2
static inline void cifs_save_when_sent(struct mid_q_entry *mid)
{
mid->when_sent = jiffies;
}
#else
-static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
-{
-}
-static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
-{
-}
-
-static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
-{
-}
-
-static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
-{
-}
-
static inline void cifs_save_when_sent(struct mid_q_entry *mid)
{
}
@@ -1907,6 +1942,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
extern struct workqueue_struct *decrypt_wq;
+extern struct workqueue_struct *fileinfo_put_wq;
extern struct workqueue_struct *cifsoplockd_wq;
extern __u32 cifs_lock_secret;
@@ -1937,4 +1973,10 @@ extern struct smb_version_values smb302_values;
#define ALT_SMB311_VERSION_STRING "3.11"
extern struct smb_version_operations smb311_operations;
extern struct smb_version_values smb311_values;
+
+static inline bool is_smb1_server(struct TCP_Server_Info *server)
+{
+ return strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0;
+}
+
#endif /* _CIFS_GLOB_H */
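
The chans[]/chan_count/chan_max fields above hold the new multichannel state, and chan_seq is annotated as round-robin state. A hedged sketch of how a sender could pick a channel with it (the real selection sits in the transport code, whose hunks are not shown here; the function name and placement are hypothetical):

	/* illustrative only: spread requests over established channels */
	static struct TCP_Server_Info *pick_channel(struct cifs_ses *ses)
	{
		unsigned int index = 0;

		if (ses->chan_count > 1)
			index = (unsigned int)atomic_inc_return(&ses->chan_seq)
				% ses->chan_count;

		return ses->chans[index].server;
	}
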
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fe597d3d5208..1ed695336f62 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -109,6 +109,7 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
char *in_buf, int flags);
extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *,
+ struct TCP_Server_Info *,
struct smb_rqst *);
extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
struct smb_rqst *);
@@ -242,6 +243,7 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid,
struct tcon_link *tlink,
struct cifs_pending_open *open);
extern void cifs_del_pending_open(struct cifs_pending_open *open);
+extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb_vol *vol);
extern void cifs_put_tcp_session(struct TCP_Server_Info *server,
int from_reconnect);
extern void cifs_put_tcon(struct cifs_tcon *tcon);
@@ -584,6 +586,12 @@ void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc);
extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
unsigned int *len, unsigned int *offset);
+int cifs_try_adding_channels(struct cifs_ses *ses);
+int cifs_ses_add_channel(struct cifs_ses *ses,
+ struct cifs_server_iface *iface);
+bool is_server_using_iface(struct TCP_Server_Info *server,
+ struct cifs_server_iface *iface);
+bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ccaa8bad336f..86d1baedf21c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -97,6 +97,7 @@ enum {
Opt_persistent, Opt_nopersistent,
Opt_resilient, Opt_noresilient,
Opt_domainauto, Opt_rdma, Opt_modesid, Opt_rootfs,
+ Opt_multichannel, Opt_nomultichannel,
Opt_compress,
/* Mount options which take numeric value */
@@ -106,7 +107,7 @@ enum {
Opt_min_enc_offload,
Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo,
Opt_echo_interval, Opt_max_credits, Opt_handletimeout,
- Opt_snapshot,
+ Opt_snapshot, Opt_max_channels,
/* Mount options which take string value */
Opt_user, Opt_pass, Opt_ip,
@@ -199,6 +200,8 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_noresilient, "noresilienthandles"},
{ Opt_domainauto, "domainauto"},
{ Opt_rdma, "rdma"},
+ { Opt_multichannel, "multichannel" },
+ { Opt_nomultichannel, "nomultichannel" },
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -218,6 +221,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_echo_interval, "echo_interval=%s" },
{ Opt_max_credits, "max_credits=%s" },
{ Opt_snapshot, "snapshot=%s" },
+ { Opt_max_channels, "max_channels=%s" },
{ Opt_compress, "compress=%s" },
{ Opt_blank_user, "user=" },
@@ -387,7 +391,7 @@ static inline int reconn_set_ipaddr(struct TCP_Server_Info *server)
#ifdef CONFIG_CIFS_DFS_UPCALL
struct super_cb_data {
struct TCP_Server_Info *server;
- struct cifs_sb_info *cifs_sb;
+ struct super_block *sb;
};
/* These functions must be called with server->srv_mutex held */
@@ -398,25 +402,39 @@ static void super_cb(struct super_block *sb, void *arg)
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
- if (d->cifs_sb)
+ if (d->sb)
return;
cifs_sb = CIFS_SB(sb);
tcon = cifs_sb_master_tcon(cifs_sb);
if (tcon->ses->server == d->server)
- d->cifs_sb = cifs_sb;
+ d->sb = sb;
}
-static inline struct cifs_sb_info *
-find_super_by_tcp(struct TCP_Server_Info *server)
+static struct super_block *get_tcp_super(struct TCP_Server_Info *server)
{
struct super_cb_data d = {
.server = server,
- .cifs_sb = NULL,
+ .sb = NULL,
};
iterate_supers_type(&cifs_fs_type, super_cb, &d);
- return d.cifs_sb ? d.cifs_sb : ERR_PTR(-ENOENT);
+
+ if (unlikely(!d.sb))
+ return ERR_PTR(-ENOENT);
+ /*
+ * Grab an active reference in order to prevent automounts (DFS links)
+ * from expiring and then freeing up our cifs superblock pointer while
+ * we're doing failover.
+ */
+ cifs_sb_active(d.sb);
+ return d.sb;
+}
+
+static inline void put_tcp_super(struct super_block *sb)
+{
+ if (!IS_ERR_OR_NULL(sb))
+ cifs_sb_deactive(sb);
}
static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
@@ -480,6 +498,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
struct mid_q_entry *mid_entry;
struct list_head retry_list;
#ifdef CONFIG_CIFS_DFS_UPCALL
+ struct super_block *sb = NULL;
struct cifs_sb_info *cifs_sb = NULL;
struct dfs_cache_tgt_list tgt_list = {0};
struct dfs_cache_tgt_iterator *tgt_it = NULL;
@@ -489,13 +508,15 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->nr_targets = 1;
#ifdef CONFIG_CIFS_DFS_UPCALL
spin_unlock(&GlobalMid_Lock);
- cifs_sb = find_super_by_tcp(server);
- if (IS_ERR(cifs_sb)) {
- rc = PTR_ERR(cifs_sb);
+ sb = get_tcp_super(server);
+ if (IS_ERR(sb)) {
+ rc = PTR_ERR(sb);
cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n",
__func__, rc);
- cifs_sb = NULL;
+ sb = NULL;
} else {
+ cifs_sb = CIFS_SB(sb);
+
rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it);
if (rc && (rc != -EOPNOTSUPP)) {
cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n",
@@ -512,6 +533,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
/* the demux thread will exit normally
next time through the loop */
spin_unlock(&GlobalMid_Lock);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_cache_free_tgts(&tgt_list);
+ put_tcp_super(sb);
+#endif
return rc;
} else
server->tcpStatus = CifsNeedReconnect;
@@ -638,7 +663,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
__func__, rc);
}
dfs_cache_free_tgts(&tgt_list);
+
}
+
+ put_tcp_super(sb);
#endif
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
@@ -905,6 +933,20 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
spin_unlock(&GlobalMid_Lock);
}
+static unsigned int
+smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
+{
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+
+ /*
+ * SMB1 does not use credits.
+ */
+ if (server->vals->header_preamble_size)
+ return 0;
+
+ return le16_to_cpu(shdr->CreditRequest);
+}
+
static void
handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
char *buf, int malformed)
@@ -912,6 +954,7 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
if (server->ops->check_trans2 &&
server->ops->check_trans2(mid, server, buf, malformed))
return;
+ mid->credits_received = smb2_get_credits_from_hdr(buf, server);
mid->resp_buf = buf;
mid->large_buf = server->large_buf;
/* Was previous buf put in mpx struct for multi-rsp? */
@@ -1222,12 +1265,6 @@ next_pdu:
for (i = 0; i < num_mids; i++) {
if (mids[i] != NULL) {
mids[i]->resp_buf_size = server->pdu_size;
- if ((mids[i]->mid_flags & MID_WAIT_CANCELLED) &&
- mids[i]->mid_state == MID_RESPONSE_RECEIVED &&
- server->ops->handle_cancelled_mid)
- server->ops->handle_cancelled_mid(
- mids[i]->resp_buf,
- server);
if (!mids[i]->multiRsp || mids[i]->multiEnd)
mids[i]->callback(mids[i]);
@@ -1672,6 +1709,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT;
+ /* default to no multichannel (single server connection) */
+ vol->multichannel = false;
+ vol->max_channels = 1;
+
if (!mountdata)
goto cifs_parse_mount_err;
@@ -1965,6 +2006,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_rdma:
vol->rdma = true;
break;
+ case Opt_multichannel:
+ vol->multichannel = true;
+ break;
+ case Opt_nomultichannel:
+ vol->multichannel = false;
+ break;
case Opt_compress:
vol->compression = UNKNOWN_TYPE;
cifs_dbg(VFS,
@@ -2128,6 +2175,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
vol->max_credits = option;
break;
+ case Opt_max_channels:
+ if (get_option_ul(args, &option) || option < 1 ||
+ option > CIFS_MAX_CHANNELS) {
+ cifs_dbg(VFS, "%s: Invalid max_channels value, needs to be 1-%d\n",
+ __func__, CIFS_MAX_CHANNELS);
+ goto cifs_parse_mount_err;
+ }
+ vol->max_channels = option;
+ break;
/* String Arguments */
@@ -2713,7 +2769,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
send_sig(SIGKILL, task, 1);
}
-static struct TCP_Server_Info *
+struct TCP_Server_Info *
cifs_get_tcp_session(struct smb_vol *volume_info)
{
struct TCP_Server_Info *tcp_ses = NULL;
@@ -2772,7 +2828,11 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
sizeof(tcp_ses->srcaddr));
memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
sizeof(tcp_ses->dstaddr));
- generate_random_uuid(tcp_ses->client_guid);
+ if (volume_info->use_client_guid)
+ memcpy(tcp_ses->client_guid, volume_info->client_guid,
+ SMB2_CLIENT_GUID_SIZE);
+ else
+ generate_random_uuid(tcp_ses->client_guid);
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
@@ -2861,6 +2921,13 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
vol->sectype != ses->sectype)
return 0;
+ /*
+ * If an existing session is limited to less channels than
+ * requested, it should not be reused
+ */
+ if (ses->chan_max < vol->max_channels)
+ return 0;
+
switch (ses->sectype) {
case Kerberos:
if (!uid_eq(vol->cred_uid, ses->cred_uid))
@@ -3031,6 +3098,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
+ /* close any extra channels */
+ if (ses->chan_count > 1) {
+ int i;
+
+ for (i = 1; i < ses->chan_count; i++)
+ cifs_put_tcp_session(ses->chans[i].server, 0);
+ }
+
sesInfoFree(ses);
cifs_put_tcp_session(server, 0);
}
@@ -3277,14 +3352,25 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
ses->sectype = volume_info->sectype;
ses->sign = volume_info->sign;
mutex_lock(&ses->session_mutex);
+
+ /* add server as first channel */
+ ses->chans[0].server = server;
+ ses->chan_count = 1;
+ ses->chan_max = volume_info->multichannel ? volume_info->max_channels:1;
+
rc = cifs_negotiate_protocol(xid, ses);
if (!rc)
rc = cifs_setup_session(xid, ses, volume_info->local_nls);
+
+ /* each channel uses a different signing key */
+ memcpy(ses->chans[0].signkey, ses->smb3signingkey,
+ sizeof(ses->smb3signingkey));
+
mutex_unlock(&ses->session_mutex);
if (rc)
goto get_ses_fail;
- /* success, put it on the list */
+ /* success, put it on the list and add it as first channel */
spin_lock(&cifs_tcp_ses_lock);
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -4700,6 +4786,17 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol,
}
#ifdef CONFIG_CIFS_DFS_UPCALL
+static inline void set_root_tcon(struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon,
+ struct cifs_tcon **root)
+{
+ spin_lock(&cifs_tcp_ses_lock);
+ tcon->tc_count++;
+ tcon->remap = cifs_remap(cifs_sb);
+ spin_unlock(&cifs_tcp_ses_lock);
+ *root = tcon;
+}
+
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
{
int rc = 0;
@@ -4801,18 +4898,10 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
/* Cache out resolved root server */
(void)dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
root_path + 1, NULL, NULL);
- /*
- * Save root tcon for additional DFS requests to update or create a new
- * DFS cache entry, or even perform DFS failover.
- */
- spin_lock(&cifs_tcp_ses_lock);
- tcon->tc_count++;
- tcon->dfs_path = root_path;
+ kfree(root_path);
root_path = NULL;
- tcon->remap = cifs_remap(cifs_sb);
- spin_unlock(&cifs_tcp_ses_lock);
- root_tcon = tcon;
+ set_root_tcon(cifs_sb, tcon, &root_tcon);
for (count = 1; ;) {
if (!rc && tcon) {
@@ -4849,6 +4938,15 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
mount_put_conns(cifs_sb, xid, server, ses, tcon);
rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses,
&tcon);
+ /*
+ * Ensure that DFS referrals go through new root server.
+ */
+ if (!rc && tcon &&
+ (tcon->share_flags & (SHI1005_FLAGS_DFS |
+ SHI1005_FLAGS_DFS_ROOT))) {
+ cifs_put_tcon(root_tcon);
+ set_root_tcon(cifs_sb, tcon, &root_tcon);
+ }
}
if (rc) {
if (rc == -EACCES || rc == -EOPNOTSUPP)
@@ -4897,6 +4995,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
cifs_autodisable_serverino(cifs_sb);
out:
free_xid(xid);
+ cifs_try_adding_channels(ses);
return mount_setup_tlink(cifs_sb, ses, tcon);
error:
@@ -5142,7 +5241,7 @@ int
cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
{
int rc = 0;
- struct TCP_Server_Info *server = ses->server;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
if (!server->ops->need_neg || !server->ops->negotiate)
return -ENOSYS;
@@ -5169,23 +5268,25 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info)
{
int rc = -ENOSYS;
- struct TCP_Server_Info *server = ses->server;
-
- ses->capabilities = server->capabilities;
- if (linuxExtEnabled == 0)
- ses->capabilities &= (~server->vals->cap_unix);
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
+
+ if (!ses->binding) {
+ ses->capabilities = server->capabilities;
+ if (linuxExtEnabled == 0)
+ ses->capabilities &= (~server->vals->cap_unix);
+
+ if (ses->auth_key.response) {
+ cifs_dbg(FYI, "Free previous auth_key.response = %p\n",
+ ses->auth_key.response);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ ses->auth_key.len = 0;
+ }
+ }
cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n",
server->sec_mode, server->capabilities, server->timeAdj);
- if (ses->auth_key.response) {
- cifs_dbg(FYI, "Free previous auth_key.response = %p\n",
- ses->auth_key.response);
- kfree(ses->auth_key.response);
- ses->auth_key.response = NULL;
- ses->auth_key.len = 0;
- }
-
if (server->ops->sess_setup)
rc = server->ops->sess_setup(xid, ses, nls_info);
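
Taken together, the connect.c changes parse the new multichannel/max_channels options, carry the limit into the session as chan_max, and open the extra channels from cifs_mount() via cifs_try_adding_channels(). On an SMB3 mount this is requested with an option string along the lines of user=...,vers=3.0,multichannel,max_channels=4 (values are placeholders). Note also that match_session() above now refuses to reuse a session whose chan_max is below the newly requested max_channels, so mounts asking for more channels get a fresh session.
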
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 1692c0c6c23a..2faa05860a48 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1317,7 +1317,6 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
int rc;
struct dfs_info3_param ref = {0};
char *mdata = NULL, *devname = NULL;
- bool is_smb3 = tcon->ses->server->vals->header_preamble_size == 0;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct smb_vol vol;
@@ -1344,7 +1343,7 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
goto out;
}
- rc = cifs_setup_volume_info(&vol, mdata, devname, is_smb3);
+ rc = cifs_setup_volume_info(&vol, mdata, devname, false);
kfree(devname);
if (rc) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 7ce689d31aa2..f3b79012ff29 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -244,10 +244,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
*oplock = REQ_OPLOCK;
full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!full_path)
+ return -ENOMEM;
if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fa7b0fa72bb3..f1fe9c44d298 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -288,6 +288,8 @@ cifs_down_write(struct rw_semaphore *sem)
msleep(10);
}
+static void cifsFileInfo_put_work(struct work_struct *work);
+
struct cifsFileInfo *
cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
@@ -325,6 +327,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
cfile->invalidHandle = false;
cfile->tlink = cifs_get_tlink(tlink);
INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
+ INIT_WORK(&cfile->put, cifsFileInfo_put_work);
mutex_init(&cfile->fh_mutex);
spin_lock_init(&cfile->file_info_lock);
@@ -375,6 +378,41 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
return cifs_file;
}
+static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file)
+{
+ struct inode *inode = d_inode(cifs_file->dentry);
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct cifsLockInfo *li, *tmp;
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * Delete any outstanding lock records. We'll lose them when the file
+ * is closed anyway.
+ */
+ cifs_down_write(&cifsi->lock_sem);
+ list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
+ list_del(&li->llist);
+ cifs_del_lock_waiters(li);
+ kfree(li);
+ }
+ list_del(&cifs_file->llist->llist);
+ kfree(cifs_file->llist);
+ up_write(&cifsi->lock_sem);
+
+ cifs_put_tlink(cifs_file->tlink);
+ dput(cifs_file->dentry);
+ cifs_sb_deactive(sb);
+ kfree(cifs_file);
+}
+
+static void cifsFileInfo_put_work(struct work_struct *work)
+{
+ struct cifsFileInfo *cifs_file = container_of(work,
+ struct cifsFileInfo, put);
+
+ cifsFileInfo_put_final(cifs_file);
+}
+
/**
* cifsFileInfo_put - release a reference of file priv data
*
@@ -382,15 +420,15 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
*/
void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
- _cifsFileInfo_put(cifs_file, true);
+ _cifsFileInfo_put(cifs_file, true, true);
}
/**
* _cifsFileInfo_put - release a reference of file priv data
*
* This may involve closing the filehandle @cifs_file out on the
- * server. Must be called without holding tcon->open_file_lock and
- * cifs_file->file_info_lock.
+ * server. Must be called without holding tcon->open_file_lock,
+ * cinode->open_file_lock and cifs_file->file_info_lock.
*
* If @wait_for_oplock_handler is true and we are releasing the last
* reference, wait for any running oplock break handler of the file
@@ -398,7 +436,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
* oplock break handler, you need to pass false.
*
*/
-void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
+ bool wait_oplock_handler, bool offload)
{
struct inode *inode = d_inode(cifs_file->dentry);
struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
@@ -406,7 +445,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct super_block *sb = inode->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct cifsLockInfo *li, *tmp;
struct cifs_fid fid;
struct cifs_pending_open open;
bool oplock_break_cancelled;
@@ -467,24 +505,10 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
cifs_del_pending_open(&open);
- /*
- * Delete any outstanding lock records. We'll lose them when the file
- * is closed anyway.
- */
- cifs_down_write(&cifsi->lock_sem);
- list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
- list_del(&li->llist);
- cifs_del_lock_waiters(li);
- kfree(li);
- }
- list_del(&cifs_file->llist->llist);
- kfree(cifs_file->llist);
- up_write(&cifsi->lock_sem);
-
- cifs_put_tlink(cifs_file->tlink);
- dput(cifs_file->dentry);
- cifs_sb_deactive(sb);
- kfree(cifs_file);
+ if (offload)
+ queue_work(fileinfo_put_wq, &cifs_file->put);
+ else
+ cifsFileInfo_put_final(cifs_file);
}
int cifs_open(struct inode *inode, struct file *file)
@@ -728,6 +752,13 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
+ /* O_SYNC also has bit for O_DSYNC so following check picks up either */
+ if (cfile->f_flags & O_SYNC)
+ create_options |= CREATE_WRITE_THROUGH;
+
+ if (cfile->f_flags & O_DIRECT)
+ create_options |= CREATE_NO_BUFFER;
+
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &cfile->fid);
@@ -808,7 +839,7 @@ reopen_error_exit:
int cifs_close(struct inode *inode, struct file *file)
{
if (file->private_data != NULL) {
- cifsFileInfo_put(file->private_data);
+ _cifsFileInfo_put(file->private_data, true, false);
file->private_data = NULL;
}
@@ -1681,7 +1712,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
rc = server->ops->mand_unlock_range(cfile, flock, xid);
out:
- if (flock->fl_flags & FL_POSIX) {
+ if ((flock->fl_flags & FL_POSIX) || (flock->fl_flags & FL_FLOCK)) {
/*
* If this is a request to remove all locks because we
* are closing the file, it doesn't matter if the
@@ -1698,6 +1729,52 @@ out:
return rc;
}
+int cifs_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+ int rc, xid;
+ int lock = 0, unlock = 0;
+ bool wait_flag = false;
+ bool posix_lck = false;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsFileInfo *cfile;
+ __u32 type;
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+
+ rc = -EACCES;
+ xid = get_xid();
+
+ cfile = (struct cifsFileInfo *)file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+
+ cifs_read_flock(fl, &type, &lock, &unlock, &wait_flag,
+ tcon->ses->server);
+ cifs_sb = CIFS_FILE_SB(file);
+
+ if (cap_unix(tcon->ses) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ posix_lck = true;
+
+ if (!lock && !unlock) {
+ /*
+ * if this is neither a lock nor an unlock request, there is
+ * nothing to do since we do not know what it is
+ */
+ free_xid(xid);
+ return -EOPNOTSUPP;
+ }
+
+ rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock,
+ xid);
+ free_xid(xid);
+ return rc;
+}
+
int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
{
int rc, xid;
@@ -2757,9 +2834,17 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
if (!rc) {
if (wdata->cfile->invalidHandle)
rc = -EAGAIN;
- else
+ else {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (wdata->mr) {
+ wdata->mr->need_invalidate = true;
+ smbd_deregister_mr(wdata->mr);
+ wdata->mr = NULL;
+ }
+#endif
rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
+ }
}
/* If the write was successfully sent, we are done */
@@ -3482,8 +3567,16 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
if (!rc) {
if (rdata->cfile->invalidHandle)
rc = -EAGAIN;
- else
+ else {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (rdata->mr) {
+ rdata->mr->need_invalidate = true;
+ smbd_deregister_mr(rdata->mr);
+ rdata->mr = NULL;
+ }
+#endif
rc = server->ops->async_readv(rdata);
+ }
}
/* If the read was successfully sent, we are done */
@@ -4637,12 +4730,13 @@ void cifs_oplock_break(struct work_struct *work)
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
+ bool purge_cache = false;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
- server->ops->downgrade_oplock(server, cinode,
- test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
+ server->ops->downgrade_oplock(server, cinode, cfile->oplock_level,
+ cfile->oplock_epoch, &purge_cache);
if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
cifs_has_mand_locks(cinode)) {
@@ -4657,18 +4751,21 @@ void cifs_oplock_break(struct work_struct *work)
else
break_lease(inode, O_WRONLY);
rc = filemap_fdatawrite(inode->i_mapping);
- if (!CIFS_CACHE_READ(cinode)) {
+ if (!CIFS_CACHE_READ(cinode) || purge_cache) {
rc = filemap_fdatawait(inode->i_mapping);
mapping_set_error(inode->i_mapping, rc);
cifs_zap_mapping(inode);
}
cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc);
+ if (CIFS_CACHE_WRITE(cinode))
+ goto oplock_break_ack;
}
rc = cifs_push_locks(cfile);
if (rc)
cifs_dbg(VFS, "Push locks rc = %d\n", rc);
+oplock_break_ack:
/*
* releasing stale oplock after recent reconnect of smb session using
* a now incorrect file handle is not a data integrity issue but do
@@ -4680,7 +4777,7 @@ void cifs_oplock_break(struct work_struct *work)
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
- _cifsFileInfo_put(cfile, false /* do not wait for ourself */);
+ _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
cifs_done_oplock_break(cinode);
}
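
Note: the fileinfo_put_wq that _cifsFileInfo_put() queues onto above is not created in this hunk; it presumably comes from the fs/cifs/cifsfs.c part of this series, allocated once at module init so that the final put (which may send an SMB2_CLOSE on the wire) runs in process context. A sketch under that assumption:

	/* assumed init-time counterpart; name and flags are illustrative */
	fileinfo_put_wq = alloc_workqueue("cifsfileinfoput",
					  WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!fileinfo_put_wq)
		return -ENOMEM; /* error unwinding elided */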
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index df9377828e2f..8a76195e8a69 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -727,22 +727,138 @@ static __u64 simple_hashstr(const char *str)
return hash;
}
+/**
+ * cifs_backup_query_path_info - SMB1 fallback code to get ino
+ *
+ * Fallback code to get file metadata when we don't have access to
+ * @full_path (EACCES) and have backup creds.
+ *
+ * @data will be set to search info result buffer
+ * @resp_buf will be set to cifs resp buf and needs to be freed with
+ * cifs_buf_release() when done with @data.
+ */
+static int
+cifs_backup_query_path_info(int xid,
+ struct cifs_tcon *tcon,
+ struct super_block *sb,
+ const char *full_path,
+ void **resp_buf,
+ FILE_ALL_INFO **data)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_search_info info = {0};
+ u16 flags;
+ int rc;
+
+ *resp_buf = NULL;
+ info.endOfSearch = false;
+ if (tcon->unix_ext)
+ info.info_level = SMB_FIND_FILE_UNIX;
+ else if ((tcon->ses->capabilities &
+ tcon->ses->server->vals->cap_nt_find) == 0)
+ info.info_level = SMB_FIND_FILE_INFO_STANDARD;
+ else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+ info.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
+ else /* no srvino useful for fallback to some netapp */
+ info.info_level = SMB_FIND_FILE_DIRECTORY_INFO;
+
+ flags = CIFS_SEARCH_CLOSE_ALWAYS |
+ CIFS_SEARCH_CLOSE_AT_END |
+ CIFS_SEARCH_BACKUP_SEARCH;
+
+ rc = CIFSFindFirst(xid, tcon, full_path,
+ cifs_sb, NULL, flags, &info, false);
+ if (rc)
+ return rc;
+
+ *resp_buf = (void *)info.ntwrk_buf_start;
+ *data = (FILE_ALL_INFO *)info.srch_entries_start;
+ return 0;
+}
+
+static void
+cifs_set_fattr_ino(int xid,
+ struct cifs_tcon *tcon,
+ struct super_block *sb,
+ struct inode **inode,
+ const char *full_path,
+ FILE_ALL_INFO *data,
+ struct cifs_fattr *fattr)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ int rc;
+
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
+ if (*inode)
+ fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ else
+ fattr->cf_uniqueid = iunique(sb, ROOT_I);
+ return;
+ }
+
+ /*
+ * If we have an inode pass a NULL tcon to ensure we don't
+ * make a round trip to the server. This only works for SMB2+.
+ */
+ rc = server->ops->get_srv_inum(xid,
+ *inode ? NULL : tcon,
+ cifs_sb, full_path,
+ &fattr->cf_uniqueid,
+ data);
+ if (rc) {
+ /*
+ * If that fails, reuse the existing ino or generate one
+ * and disable server inode numbers
+ */
+ if (*inode)
+ fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ else {
+ fattr->cf_uniqueid = iunique(sb, ROOT_I);
+ cifs_autodisable_serverino(cifs_sb);
+ }
+ return;
+ }
+
+ /* If no errors, check for zero root inode (invalid) */
+ if (fattr->cf_uniqueid == 0 && strlen(full_path) == 0) {
+ cifs_dbg(FYI, "Invalid (0) inodenum\n");
+ if (*inode) {
+ /* reuse */
+ fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ } else {
+ /* make an ino by hashing the UNC */
+ fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO;
+ fattr->cf_uniqueid = simple_hashstr(tcon->treeName);
+ }
+ }
+}
+
+static inline bool is_inode_cache_good(struct inode *ino)
+{
+ return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0;
+}
+
int
-cifs_get_inode_info(struct inode **inode, const char *full_path,
- FILE_ALL_INFO *data, struct super_block *sb, int xid,
+cifs_get_inode_info(struct inode **inode,
+ const char *full_path,
+ FILE_ALL_INFO *in_data,
+ struct super_block *sb, int xid,
const struct cifs_fid *fid)
{
- __u16 srchflgs;
- int rc = 0, tmprc = ENOSYS;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- char *buf = NULL;
bool adjust_tz = false;
- struct cifs_fattr fattr;
- struct cifs_search_info *srchinf = NULL;
+ struct cifs_fattr fattr = {0};
bool symlink = false;
+ FILE_ALL_INFO *data = in_data;
+ FILE_ALL_INFO *tmp_data = NULL;
+ void *smb1_backup_rsp_buf = NULL;
+ int rc = 0;
+ int tmprc = 0;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -750,142 +866,88 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
tcon = tlink_tcon(tlink);
server = tcon->ses->server;
- cifs_dbg(FYI, "Getting info on %s\n", full_path);
+ /*
+ * 1. Fetch file metadata if not provided (data)
+ */
- if ((data == NULL) && (*inode != NULL)) {
- if (CIFS_CACHE_READ(CIFS_I(*inode)) &&
- CIFS_I(*inode)->time != 0) {
+ if (!data) {
+ if (is_inode_cache_good(*inode)) {
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
- goto cgii_exit;
- }
- }
-
- /* if inode info is not passed, get it from server */
- if (data == NULL) {
- if (!server->ops->query_path_info) {
- rc = -ENOSYS;
- goto cgii_exit;
+ goto out;
}
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
+ tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (!tmp_data) {
rc = -ENOMEM;
- goto cgii_exit;
+ goto out;
}
- data = (FILE_ALL_INFO *)buf;
- rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path,
- data, &adjust_tz, &symlink);
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb,
+ full_path, tmp_data,
+ &adjust_tz, &symlink);
+ data = tmp_data;
}
- if (!rc) {
- cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz,
- symlink);
- } else if (rc == -EREMOTE) {
+ /*
+ * 2. Convert it to internal cifs metadata (fattr)
+ */
+
+ switch (rc) {
+ case 0:
+ cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink);
+ break;
+ case -EREMOTE:
+ /* DFS link, no metadata available on this server */
cifs_create_dfs_fattr(&fattr, sb);
rc = 0;
- } else if ((rc == -EACCES) && backup_cred(cifs_sb) &&
- (strcmp(server->vals->version_string, SMB1_VERSION_STRING)
- == 0)) {
+ break;
+ case -EACCES:
/*
- * For SMB2 and later the backup intent flag is already
- * sent if needed on open and there is no path based
- * FindFirst operation to use to retry with
+ * perm errors, try again with backup flags if possible
+ *
+ * For SMB2 and later the backup intent flag
+ * is already sent if needed on open and there
+ * is no path based FindFirst operation to use
+ * to retry with
*/
+ if (backup_cred(cifs_sb) && is_smb1_server(server)) {
+ /* for easier reading */
+ FILE_DIRECTORY_INFO *fdi;
+ SEARCH_ID_FULL_DIR_INFO *si;
+
+ rc = cifs_backup_query_path_info(xid, tcon, sb,
+ full_path,
+ &smb1_backup_rsp_buf,
+ &data);
+ if (rc)
+ goto out;
- srchinf = kzalloc(sizeof(struct cifs_search_info),
- GFP_KERNEL);
- if (srchinf == NULL) {
- rc = -ENOMEM;
- goto cgii_exit;
- }
+ fdi = (FILE_DIRECTORY_INFO *)data;
+ si = (SEARCH_ID_FULL_DIR_INFO *)data;
- srchinf->endOfSearch = false;
- if (tcon->unix_ext)
- srchinf->info_level = SMB_FIND_FILE_UNIX;
- else if ((tcon->ses->capabilities &
- tcon->ses->server->vals->cap_nt_find) == 0)
- srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD;
- else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
- srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
- else /* no srvino useful for fallback to some netapp */
- srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO;
-
- srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
- CIFS_SEARCH_CLOSE_AT_END |
- CIFS_SEARCH_BACKUP_SEARCH;
-
- rc = CIFSFindFirst(xid, tcon, full_path,
- cifs_sb, NULL, srchflgs, srchinf, false);
- if (!rc) {
- data = (FILE_ALL_INFO *)srchinf->srch_entries_start;
+ cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb);
+ fattr.cf_uniqueid = le64_to_cpu(si->UniqueId);
+ /* uniqueid set, skip get inum step */
+ goto handle_mnt_opt;
+ } else {
+ /* nothing we can do, bail out */
+ goto out;
+ }
+ break;
+ default:
+ cifs_dbg(FYI, "%s: unhandled err rc %d\n", __func__, rc);
+ goto out;
+ }
- cifs_dir_info_to_fattr(&fattr,
- (FILE_DIRECTORY_INFO *)data, cifs_sb);
- fattr.cf_uniqueid = le64_to_cpu(
- ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
+ /*
+ * 3. Get or update inode number (fattr.cf_uniqueid)
+ */
- cifs_buf_release(srchinf->ntwrk_buf_start);
- }
- kfree(srchinf);
- if (rc)
- goto cgii_exit;
- } else
- goto cgii_exit;
+ cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, &fattr);
/*
- * If an inode wasn't passed in, then get the inode number
- *
- * Is an i_ino of zero legal? Can we use that to check if the server
- * supports returning inode numbers? Are there other sanity checks we
- * can use to ensure that the server is really filling in that field?
+ * 4. Tweak fattr based on mount options
*/
- if (*inode == NULL) {
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
- if (server->ops->get_srv_inum)
- tmprc = server->ops->get_srv_inum(xid,
- tcon, cifs_sb, full_path,
- &fattr.cf_uniqueid, data);
- if (tmprc) {
- cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
- tmprc);
- fattr.cf_uniqueid = iunique(sb, ROOT_I);
- cifs_autodisable_serverino(cifs_sb);
- } else if ((fattr.cf_uniqueid == 0) &&
- strlen(full_path) == 0) {
- /* some servers ret bad root ino ie 0 */
- cifs_dbg(FYI, "Invalid (0) inodenum\n");
- fattr.cf_flags |=
- CIFS_FATTR_FAKE_ROOT_INO;
- fattr.cf_uniqueid =
- simple_hashstr(tcon->treeName);
- }
- } else
- fattr.cf_uniqueid = iunique(sb, ROOT_I);
- } else {
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
- && server->ops->get_srv_inum) {
- /*
- * Pass a NULL tcon to ensure we don't make a round
- * trip to the server. This only works for SMB2+.
- */
- tmprc = server->ops->get_srv_inum(xid,
- NULL, cifs_sb, full_path,
- &fattr.cf_uniqueid, data);
- if (tmprc)
- fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
- else if ((fattr.cf_uniqueid == 0) &&
- strlen(full_path) == 0) {
- /*
- * Reuse existing root inode num since
- * inum zero for root causes ls of . and .. to
- * not be returned
- */
- cifs_dbg(FYI, "Srv ret 0 inode num for root\n");
- fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
- }
- } else
- fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
- }
+handle_mnt_opt:
/* query for SFU type info if supported and needed */
if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
@@ -900,8 +962,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
full_path, fid);
if (rc) {
cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n",
- __func__, rc);
- goto cgii_exit;
+ __func__, rc);
+ goto out;
}
} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false,
@@ -909,7 +971,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
if (rc) {
cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
__func__, rc);
- goto cgii_exit;
+ goto out;
}
}
@@ -925,6 +987,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
+ /*
+ * 5. Update inode with final fattr data
+ */
+
if (!*inode) {
*inode = cifs_iget(sb, &fattr);
if (!*inode)
@@ -937,7 +1003,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
- goto cgii_exit;
+ goto out;
}
/* if filetype is different, return error */
@@ -945,18 +1011,15 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
(fattr.cf_mode & S_IFMT))) {
CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
- goto cgii_exit;
+ goto out;
}
cifs_fattr_to_inode(*inode, &fattr);
}
-
-cgii_exit:
- if ((*inode) && ((*inode)->i_ino == 0))
- cifs_dbg(FYI, "inode number of zero returned\n");
-
- kfree(buf);
+out:
+ cifs_buf_release(smb1_backup_rsp_buf);
cifs_put_tlink(tlink);
+ kfree(tmp_data);
return rc;
}
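
Note: is_smb1_server(), used in the -EACCES fallback above, is not defined in this hunk, but the removed lines show the exact test it replaces. A sketch assuming a direct translation:

	static inline bool is_smb1_server(struct TCP_Server_Info *server)
	{
		return strcmp(server->vals->version_string,
			      SMB1_VERSION_STRING) == 0;
	}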
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 5ad83bdb9bea..40ca394fd5de 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -488,21 +488,10 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
&pCifsInode->flags);
- /*
- * Set flag if the server downgrades the oplock
- * to L2 else clear.
- */
- if (pSMB->OplockLevel)
- set_bit(
- CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &pCifsInode->flags);
- else
- clear_bit(
- CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &pCifsInode->flags);
-
- cifs_queue_oplock_break(netfile);
+ netfile->oplock_epoch = 0;
+ netfile->oplock_level = pSMB->OplockLevel;
netfile->oplock_break_cancelled = false;
+ cifs_queue_oplock_break(netfile);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 85bd644f9773..fb3bdc44775c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -31,6 +31,231 @@
#include <linux/utsname.h>
#include <linux/slab.h>
#include "cifs_spnego.h"
+#include "smb2proto.h"
+
+bool
+is_server_using_iface(struct TCP_Server_Info *server,
+ struct cifs_server_iface *iface)
+{
+ struct sockaddr_in *i4 = (struct sockaddr_in *)&iface->sockaddr;
+ struct sockaddr_in6 *i6 = (struct sockaddr_in6 *)&iface->sockaddr;
+ struct sockaddr_in *s4 = (struct sockaddr_in *)&server->dstaddr;
+ struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&server->dstaddr;
+
+ if (server->dstaddr.ss_family != iface->sockaddr.ss_family)
+ return false;
+ if (server->dstaddr.ss_family == AF_INET) {
+ if (s4->sin_addr.s_addr != i4->sin_addr.s_addr)
+ return false;
+ } else if (server->dstaddr.ss_family == AF_INET6) {
+ if (memcmp(&s6->sin6_addr, &i6->sin6_addr,
+ sizeof(i6->sin6_addr)) != 0)
+ return false;
+ } else {
+ /* unknown family.. */
+ return false;
+ }
+ return true;
+}
+
+bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
+{
+ int i;
+
+ for (i = 0; i < ses->chan_count; i++) {
+ if (is_server_using_iface(ses->chans[i].server, iface))
+ return true;
+ }
+ return false;
+}
+
+/* returns number of channels added */
+int cifs_try_adding_channels(struct cifs_ses *ses)
+{
+ int old_chan_count = ses->chan_count;
+ int left = ses->chan_max - ses->chan_count;
+ int i = 0;
+ int rc = 0;
+ int tries = 0;
+
+ if (left <= 0) {
+ cifs_dbg(FYI,
+ "ses already at max_channels (%zu), nothing to open\n",
+ ses->chan_max);
+ return 0;
+ }
+
+ if (ses->server->dialect < SMB30_PROT_ID) {
+ cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n");
+ return 0;
+ }
+
+ /*
+ * Keep connecting to the same, fastest iface for all channels as
+ * long as it is RSS-capable. Try the next fastest one if it is not
+ * RSS-capable or if channel creation fails.
+ */
+ while (left > 0) {
+ struct cifs_server_iface *iface;
+
+ tries++;
+ if (tries > 3*ses->chan_max) {
+ cifs_dbg(FYI, "too many attempts at opening channels (%d channels left to open)\n",
+ left);
+ break;
+ }
+
+ iface = &ses->iface_list[i];
+ if (is_ses_using_iface(ses, iface) && !iface->rss_capable) {
+ i = (i+1) % ses->iface_count;
+ continue;
+ }
+
+ rc = cifs_ses_add_channel(ses, iface);
+ if (rc) {
+ cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n",
+ i, rc);
+ i = (i+1) % ses->iface_count;
+ continue;
+ }
+
+ cifs_dbg(FYI, "successfully opened new channel on iface#%d\n",
+ i);
+ left--;
+ }
+
+ return ses->chan_count - old_chan_count;
+}
+
+int
+cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface)
+{
+ struct cifs_chan *chan;
+ struct smb_vol vol = {NULL};
+ static const char unc_fmt[] = "\\%s\\foo";
+ char unc[sizeof(unc_fmt)+SERVER_NAME_LEN_WITH_NULL] = {0};
+ struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr;
+ struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr;
+ int rc;
+ unsigned int xid = get_xid();
+
+ cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ",
+ ses, iface->speed, iface->rdma_capable ? "yes" : "no");
+ if (iface->sockaddr.ss_family == AF_INET)
+ cifs_dbg(FYI, "ip:%pI4)\n", &ipv4->sin_addr);
+ else
+ cifs_dbg(FYI, "ip:%pI6)\n", &ipv6->sin6_addr);
+
+ /*
+ * Set up an smb_vol with mostly the same info as the existing
+ * session and overwrite it with the requested iface data.
+ *
+ * We need to set up at least the fields used for negprot and
+ * sess setup.
+ *
+ * We only need the volume here, so we can reuse memory from
+ * the session and server without caring about memory
+ * management.
+ */
+
+ /* Always make new connection for now (TODO?) */
+ vol.nosharesock = true;
+
+ /* Auth */
+ vol.domainauto = ses->domainAuto;
+ vol.domainname = ses->domainName;
+ vol.username = ses->user_name;
+ vol.password = ses->password;
+ vol.sectype = ses->sectype;
+ vol.sign = ses->sign;
+
+ /* UNC and paths */
+ /* XXX: Use ses->server->hostname? */
+ sprintf(unc, unc_fmt, ses->serverName);
+ vol.UNC = unc;
+ vol.prepath = "";
+
+ /* Re-use same version as master connection */
+ vol.vals = ses->server->vals;
+ vol.ops = ses->server->ops;
+
+ vol.noblocksnd = ses->server->noblocksnd;
+ vol.noautotune = ses->server->noautotune;
+ vol.sockopt_tcp_nodelay = ses->server->tcp_nodelay;
+ vol.echo_interval = ses->server->echo_interval / HZ;
+
+ /*
+ * This will be used for encoding/decoding user/domain/pw
+ * during sess setup auth.
+ *
+ * XXX: We use the default for simplicity but the proper way
+ * would be to use the one that ses used, which is not
+ * stored. This might break when dealing with non-ascii
+ * strings.
+ */
+ vol.local_nls = load_nls_default();
+
+ /* Use RDMA if possible */
+ vol.rdma = iface->rdma_capable;
+ memcpy(&vol.dstaddr, &iface->sockaddr, sizeof(struct sockaddr_storage));
+
+ /* reuse master con client guid */
+ memcpy(&vol.client_guid, ses->server->client_guid,
+ SMB2_CLIENT_GUID_SIZE);
+ vol.use_client_guid = true;
+
+ mutex_lock(&ses->session_mutex);
+
+ chan = &ses->chans[ses->chan_count];
+ chan->server = cifs_get_tcp_session(&vol);
+ if (IS_ERR(chan->server)) {
+ rc = PTR_ERR(chan->server);
+ chan->server = NULL;
+ goto out;
+ }
+
+ /*
+ * We need to allocate the server crypto now as we will need
+ * to sign packets before we generate the channel signing key
+ * (we sign with the session key)
+ */
+ rc = smb311_crypto_shash_allocate(chan->server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+ goto out;
+ }
+
+ ses->binding = true;
+ rc = cifs_negotiate_protocol(xid, ses);
+ if (rc)
+ goto out;
+
+ rc = cifs_setup_session(xid, ses, vol.local_nls);
+ if (rc)
+ goto out;
+
+ /* success, put it on the list
+ * XXX: sharing a ses between two tcp servers is not possible, the
+ * way "internal" linked lists work in linux makes an element
+ * able to belong to only one list
+ *
+ * the binding session is already established so the rest of
+ * the code should be able to look it up, no need to add the
+ * ses to the new server.
+ */
+
+ ses->chan_count++;
+ atomic_set(&ses->chan_seq, 0);
+out:
+ ses->binding = false;
+ mutex_unlock(&ses->session_mutex);
+
+ if (rc && chan->server)
+ cifs_put_tcp_session(chan->server, 0);
+ unload_nls(vol.local_nls);
+
+ return rc;
+}
static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
{
@@ -342,6 +567,7 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
struct cifs_ses *ses)
{
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
__u32 flags;
@@ -354,9 +580,9 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC |
NTLMSSP_NEGOTIATE_SEAL;
- if (ses->server->sign)
+ if (server->sign)
flags |= NTLMSSP_NEGOTIATE_SIGN;
- if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
+ if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
sec_blob->NegotiateFlags = cpu_to_le32(flags);
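
Note: cifs_ses_add_channel() resets chan_seq but nothing in this hunk consumes it; a later patch in the series presumably spreads requests across channels with it. A purely illustrative round-robin sketch (cifs_pick_channel is a hypothetical name here):

	/* hypothetical consumer: rotate transmissions across bound channels */
	static struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
	{
		unsigned int index = (unsigned int)atomic_inc_return(&ses->chan_seq);

		return ses->chans[index % ses->chan_count].server;
	}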
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 514810694c0f..d70a2bb062df 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -369,12 +369,10 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
static void
cifs_downgrade_oplock(struct TCP_Server_Info *server,
- struct cifsInodeInfo *cinode, bool set_level2)
+ struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache)
{
- if (set_level2)
- cifs_set_oplock_level(cinode, OPLOCK_READ);
- else
- cifs_set_oplock_level(cinode, 0);
+ cifs_set_oplock_level(cinode, oplock);
}
static bool
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index e311f58dc1c8..0516fc482d43 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -29,6 +29,7 @@
#include "cifs_unicode.h"
#include "smb2status.h"
#include "smb2glob.h"
+#include "nterr.h"
static int
check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid)
@@ -249,16 +250,10 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
* of junk. Other servers match RFC1001 len to actual
* SMB2/SMB3 frame length (header + smb2 response specific data)
* Some windows servers also pad up to 8 bytes when compounding.
- * If pad is longer than eight bytes, log the server behavior
- * (once), since may indicate a problem but allow it and continue
- * since the frame is parseable.
*/
- if (clc_len < len) {
- pr_warn_once(
- "srv rsp padded more than expected. Length %d not %d for cmd:%d mid:%llu\n",
- len, clc_len, command, mid);
+ if (clc_len < len)
return 0;
- }
+
pr_warn_once(
"srv rsp too short, len %d not %d. cmd:%d mid:%llu\n",
len, clc_len, command, mid);
@@ -534,7 +529,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
cifs_dbg(FYI, "found in the open list\n");
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
- le32_to_cpu(rsp->NewLeaseState));
+ lease_state);
if (ack_req)
cfile->oplock_break_cancelled = false;
@@ -543,17 +538,8 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
- /*
- * Set or clear flags depending on the lease state being READ.
- * HANDLE caching flag should be added when the client starts
- * to defer closing remote file handles with HANDLE leases.
- */
- if (lease_state & SMB2_LEASE_READ_CACHING_HE)
- set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &cinode->flags);
- else
- clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &cinode->flags);
+ cfile->oplock_epoch = le16_to_cpu(rsp->Epoch);
+ cfile->oplock_level = lease_state;
cifs_queue_oplock_break(cfile);
kfree(lw);
@@ -576,7 +562,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
cifs_dbg(FYI, "found in the pending open list\n");
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
- le32_to_cpu(rsp->NewLeaseState));
+ lease_state);
open->oplock = lease_state;
}
@@ -673,10 +659,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &server->smb_ses_list) {
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
- cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
spin_lock(&tcon->open_file_lock);
list_for_each(tmp2, &tcon->openFileList) {
cfile = list_entry(tmp2, struct cifsFileInfo,
@@ -688,6 +674,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
continue;
cifs_dbg(FYI, "file id match, oplock break\n");
+ cifs_stats_inc(
+ &tcon->stats.cifs_stats.num_oplock_brks);
cinode = CIFS_I(d_inode(cfile->dentry));
spin_lock(&cfile->file_info_lock);
if (!CIFS_CACHE_WRITE(cinode) &&
@@ -699,18 +687,9 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
&cinode->flags);
- /*
- * Set flag if the server downgrades the oplock
- * to L2 else clear.
- */
- if (rsp->OplockLevel)
- set_bit(
- CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &cinode->flags);
- else
- clear_bit(
- CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &cinode->flags);
+ cfile->oplock_epoch = 0;
+ cfile->oplock_level = rsp->OplockLevel;
+
spin_unlock(&cfile->file_info_lock);
cifs_queue_oplock_break(cfile);
@@ -720,9 +699,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
return true;
}
spin_unlock(&tcon->open_file_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- cifs_dbg(FYI, "No matching file for oplock break\n");
- return true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -735,45 +711,98 @@ smb2_cancelled_close_fid(struct work_struct *work)
{
struct close_cancelled_open *cancelled = container_of(work,
struct close_cancelled_open, work);
+ struct cifs_tcon *tcon = cancelled->tcon;
+ int rc;
- cifs_dbg(VFS, "Close unmatched open\n");
+ if (cancelled->mid)
+ cifs_tcon_dbg(VFS, "Close unmatched open for MID:%llx\n",
+ cancelled->mid);
+ else
+ cifs_tcon_dbg(VFS, "Close interrupted close\n");
- SMB2_close(0, cancelled->tcon, cancelled->fid.persistent_fid,
- cancelled->fid.volatile_fid);
- cifs_put_tcon(cancelled->tcon);
+ rc = SMB2_close(0, tcon, cancelled->fid.persistent_fid,
+ cancelled->fid.volatile_fid);
+ if (rc)
+ cifs_tcon_dbg(VFS, "Close cancelled mid failed rc:%d\n", rc);
+
+ cifs_put_tcon(tcon);
kfree(cancelled);
}
+/*
+ * Caller should already have an extra reference to @tcon.
+ * This function is used to queue work to close a handle to prevent leaks
+ * on the server.
+ * We handle two cases: first, if an open was interrupted after we sent the
+ * SMB2_CREATE to the server but before we processed the reply, and second,
+ * if a close was interrupted before we sent the SMB2_CLOSE to the server.
+ */
+static int
+__smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid,
+ __u64 persistent_fid, __u64 volatile_fid)
+{
+ struct close_cancelled_open *cancelled;
+
+ cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
+ if (!cancelled)
+ return -ENOMEM;
+
+ cancelled->fid.persistent_fid = persistent_fid;
+ cancelled->fid.volatile_fid = volatile_fid;
+ cancelled->tcon = tcon;
+ cancelled->cmd = cmd;
+ cancelled->mid = mid;
+ INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
+ WARN_ON(queue_work(cifsiod_wq, &cancelled->work) == false);
+
+ return 0;
+}
+
+int
+smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
+ __u64 volatile_fid)
+{
+ int rc;
+
+ cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
+ spin_lock(&cifs_tcp_ses_lock);
+ tcon->tc_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ rc = __smb2_handle_cancelled_cmd(tcon, SMB2_CLOSE_HE, 0,
+ persistent_fid, volatile_fid);
+ if (rc)
+ cifs_put_tcon(tcon);
+
+ return rc;
+}
+
int
smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer;
struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
struct cifs_tcon *tcon;
- struct close_cancelled_open *cancelled;
+ int rc;
if (sync_hdr->Command != SMB2_CREATE ||
sync_hdr->Status != STATUS_SUCCESS)
return 0;
- cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
- if (!cancelled)
- return -ENOMEM;
-
tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
sync_hdr->TreeId);
- if (!tcon) {
- kfree(cancelled);
+ if (!tcon)
return -ENOENT;
- }
- cancelled->fid.persistent_fid = rsp->PersistentFileId;
- cancelled->fid.volatile_fid = rsp->VolatileFileId;
- cancelled->tcon = tcon;
- INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
- queue_work(cifsiod_wq, &cancelled->work);
+ rc = __smb2_handle_cancelled_cmd(tcon,
+ le16_to_cpu(sync_hdr->Command),
+ le64_to_cpu(sync_hdr->MessageId),
+ rsp->PersistentFileId,
+ rsp->VolatileFileId);
+ if (rc)
+ cifs_put_tcon(tcon);
- return 0;
+ return rc;
}
/**
@@ -788,23 +817,37 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec)
int i, rc;
struct sdesc *d;
struct smb2_sync_hdr *hdr;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
- if (ses->server->tcpStatus == CifsGood) {
- /* skip non smb311 connections */
- if (ses->server->dialect != SMB311_PROT_ID)
- return 0;
+ hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ /* negprot requests are always taken into the preauth hash */
+ if (hdr->Command == SMB2_NEGOTIATE)
+ goto ok;
- /* skip last sess setup response */
- hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
- if (hdr->Flags & SMB2_FLAGS_SIGNED)
- return 0;
- }
+ /*
+ * If we process a command which wasn't a negprot it means the
+ * neg prot was already done, so the server dialect was set
+ * and we can test it. Preauth requires 3.1.1 for now.
+ */
+ if (server->dialect != SMB311_PROT_ID)
+ return 0;
+
+ if (hdr->Command != SMB2_SESSION_SETUP)
+ return 0;
+
+ /* skip last sess setup response */
+ if ((hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
+ && (hdr->Status == NT_STATUS_OK
+ || (hdr->Status !=
+ cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))))
+ return 0;
- rc = smb311_crypto_shash_allocate(ses->server);
+ok:
+ rc = smb311_crypto_shash_allocate(server);
if (rc)
return rc;
- d = ses->server->secmech.sdescsha512;
+ d = server->secmech.sdescsha512;
rc = crypto_shash_init(&d->shash);
if (rc) {
cifs_dbg(VFS, "%s: could not init sha512 shash\n", __func__);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index cd55af9b7cc5..a7f328f79c6f 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -10,6 +10,7 @@
#include <linux/falloc.h>
#include <linux/scatterlist.h>
#include <linux/uuid.h>
+#include <linux/sort.h>
#include <crypto/aead.h>
#include "cifsglob.h"
#include "smb2pdu.h"
@@ -151,13 +152,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype)
static unsigned int
smb2_get_credits(struct mid_q_entry *mid)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf;
-
- if (mid->mid_state == MID_RESPONSE_RECEIVED
- || mid->mid_state == MID_RESPONSE_MALFORMED)
- return le16_to_cpu(shdr->CreditRequest);
-
- return 0;
+ return mid->credits_received;
}
static int
@@ -315,7 +310,7 @@ smb2_negotiate(const unsigned int xid, struct cifs_ses *ses)
{
int rc;
- ses->server->CurrentMid = 0;
+ cifs_ses_server(ses)->CurrentMid = 0;
rc = SMB2_negotiate(xid, ses);
/* BB we probably don't need to retry with modern servers */
if (rc == -EAGAIN)
@@ -558,6 +553,13 @@ out:
return rc;
}
+static int compare_iface(const void *ia, const void *ib)
+{
+ const struct cifs_server_iface *a = (struct cifs_server_iface *)ia;
+ const struct cifs_server_iface *b = (struct cifs_server_iface *)ib;
+
+ return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1);
+}
static int
SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
@@ -587,6 +589,9 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
if (rc)
goto out;
+ /* sort interfaces from fastest to slowest */
+ sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL);
+
spin_lock(&ses->iface_lock);
kfree(ses->iface_list);
ses->iface_list = iface_list;
@@ -1402,15 +1407,10 @@ smb2_ioctl_query_info(const unsigned int xid,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- buffer = kmalloc(qi.output_buffer_length, GFP_KERNEL);
- if (buffer == NULL)
- return -ENOMEM;
-
- if (copy_from_user(buffer, arg + sizeof(struct smb_query_info),
- qi.output_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
+ buffer = memdup_user(arg + sizeof(struct smb_query_info),
+ qi.output_buffer_length);
+ if (IS_ERR(buffer))
+ return PTR_ERR(buffer);
/* Open */
memset(&open_iov, 0, sizeof(open_iov));
@@ -1529,35 +1529,32 @@ smb2_ioctl_query_info(const unsigned int xid,
if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length)
qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
if (qi.input_buffer_length > 0 &&
- le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length > rsp_iov[1].iov_len) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
- if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
- sizeof(qi.input_buffer_length))) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
+ le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length
+ > rsp_iov[1].iov_len)
+ goto e_fault;
+
+ if (copy_to_user(&pqi->input_buffer_length,
+ &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length)))
+ goto e_fault;
+
if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info),
(const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset),
- qi.input_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
+ qi.input_buffer_length))
+ goto e_fault;
} else {
pqi = (struct smb_query_info __user *)arg;
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length)
qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
- if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
- sizeof(qi.input_buffer_length))) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
- if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
+ if (copy_to_user(&pqi->input_buffer_length,
+ &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length)))
+ goto e_fault;
+
+ if (copy_to_user(pqi + 1, qi_rsp->Buffer,
+ qi.input_buffer_length))
+ goto e_fault;
}
iqinf_exit:
@@ -1573,6 +1570,10 @@ smb2_ioctl_query_info(const unsigned int xid,
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
return rc;
+
+e_fault:
+ rc = -EFAULT;
+ goto iqinf_exit;
}
static ssize_t
@@ -3281,22 +3282,38 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
static void
smb2_downgrade_oplock(struct TCP_Server_Info *server,
- struct cifsInodeInfo *cinode, bool set_level2)
+ struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache)
{
- if (set_level2)
- server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II,
- 0, NULL);
- else
- server->ops->set_oplock_level(cinode, 0, 0, NULL);
+ server->ops->set_oplock_level(cinode, oplock, 0, NULL);
}
static void
-smb21_downgrade_oplock(struct TCP_Server_Info *server,
- struct cifsInodeInfo *cinode, bool set_level2)
+smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache);
+
+static void
+smb3_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache)
{
- server->ops->set_oplock_level(cinode,
- set_level2 ? SMB2_LEASE_READ_CACHING_HE :
- 0, 0, NULL);
+ unsigned int old_state = cinode->oplock;
+ unsigned int old_epoch = cinode->epoch;
+ unsigned int new_state;
+
+ if (epoch > old_epoch) {
+ smb21_set_oplock_level(cinode, oplock, 0, NULL);
+ cinode->epoch = epoch;
+ }
+
+ new_state = cinode->oplock;
+ *purge_cache = false;
+
+ if ((old_state & CIFS_CACHE_READ_FLG) != 0 &&
+ (new_state & CIFS_CACHE_READ_FLG) == 0)
+ *purge_cache = true;
+ else if (old_state == new_state && (epoch - old_epoch > 1))
+ *purge_cache = true;
}
static void
@@ -3598,14 +3615,16 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
u8 *ses_enc_key;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- if (ses->Suid != ses_id)
- continue;
- ses_enc_key = enc ? ses->smb3encryptionkey :
- ses->smb3decryptionkey;
- memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE);
- spin_unlock(&cifs_tcp_ses_lock);
- return 0;
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (ses->Suid == ses_id) {
+ ses_enc_key = enc ? ses->smb3encryptionkey :
+ ses->smb3decryptionkey;
+ memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return 0;
+ }
+ }
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -4556,7 +4575,7 @@ struct smb_version_operations smb21_operations = {
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb21_downgrade_oplock,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -4656,7 +4675,7 @@ struct smb_version_operations smb30_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb21_downgrade_oplock,
+ .downgrade_oplock = smb3_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
@@ -4764,7 +4783,7 @@ struct smb_version_operations smb311_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb21_downgrade_oplock,
+ .downgrade_oplock = smb3_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
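
Note: to see how smb3_downgrade_oplock() above uses the epoch, consider an illustrative sequence (the values are made up):

	/*
	 * cinode holds R|H|W caching at epoch 2.
	 * break #1: lease state = read caching, epoch 3
	 *   -> epoch advanced, state downgraded; READ is still cached
	 *      and the states differ, so *purge_cache stays false
	 * break #2: lease state = no caching, epoch 4
	 *   -> READ flag was held and is now gone: *purge_cache = true
	 * break #3: same resulting state, but epoch jumps 4 -> 6
	 *   -> old_state == new_state with epoch - old_epoch > 1 means a
	 *      break was missed in between; purge the cache defensively
	 */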
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 05149862aea4..ed77f94dbf1d 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -252,7 +252,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
if (tcon == NULL)
return 0;
- if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL)
+ if (smb2_command == SMB2_TREE_CONNECT)
return 0;
if (tcon->tidStatus == CifsExiting) {
@@ -426,16 +426,9 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf,
* SMB information in the SMB header. If the return code is zero, this
* function must have filled in request_buf pointer.
*/
-static int
-smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
- void **request_buf, unsigned int *total_len)
+static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
{
- int rc;
-
- rc = smb2_reconnect(smb2_command, tcon);
- if (rc)
- return rc;
-
/* BB eventually switch this to SMB2 specific small buf size */
if (smb2_command == SMB2_SET_INFO)
*request_buf = cifs_buf_get();
@@ -456,7 +449,31 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
cifs_stats_inc(&tcon->num_smbs_sent);
}
- return rc;
+ return 0;
+}
+
+static int smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
+{
+ int rc;
+
+ rc = smb2_reconnect(smb2_command, tcon);
+ if (rc)
+ return rc;
+
+ return __smb2_plain_req_init(smb2_command, tcon, request_buf,
+ total_len);
+}
+
+static int smb2_ioctl_req_init(u32 opcode, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
+{
+ /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */
+ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) {
+ return __smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf,
+ total_len);
+ }
+ return smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf, total_len);
}
/* For explanation of negotiate contexts see MS-SMB2 section 2.2.3.1 */
@@ -791,7 +808,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
struct kvec rsp_iov;
int rc = 0;
int resp_buftype;
- struct TCP_Server_Info *server = ses->server;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
int blob_offset, blob_length;
char *security_blob;
int flags = CIFS_NEG_OP;
@@ -813,7 +830,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
- if (strcmp(ses->server->vals->version_string,
+ if (strcmp(server->vals->version_string,
SMB3ANY_VERSION_STRING) == 0) {
req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
@@ -829,7 +846,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
total_len += 8;
} else {
/* otherwise send specific dialect */
- req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
+ req->Dialects[0] = cpu_to_le16(server->vals->protocol_id);
req->DialectCount = cpu_to_le16(1);
total_len += 2;
}
@@ -1171,7 +1188,7 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
int rc;
struct cifs_ses *ses = sess_data->ses;
struct smb2_sess_setup_req *req;
- struct TCP_Server_Info *server = ses->server;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
unsigned int total_len;
rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, (void **) &req,
@@ -1179,13 +1196,21 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
if (rc)
return rc;
- /* First session, not a reauthenticate */
- req->sync_hdr.SessionId = 0;
-
- /* if reconnect, we need to send previous sess id, otherwise it is 0 */
- req->PreviousSessionId = sess_data->previous_session;
-
- req->Flags = 0; /* MBZ */
+ if (sess_data->ses->binding) {
+ req->sync_hdr.SessionId = sess_data->ses->Suid;
+ req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->PreviousSessionId = 0;
+ req->Flags = SMB2_SESSION_REQ_FLAG_BINDING;
+ } else {
+ /* First session, not a reauthenticate */
+ req->sync_hdr.SessionId = 0;
+ /*
+ * if reconnect, we need to send previous sess id
+ * otherwise it is 0
+ */
+ req->PreviousSessionId = sess_data->previous_session;
+ req->Flags = 0; /* MBZ */
+ }
/* enough to enable echos and oplocks and one max size write */
req->sync_hdr.CreditRequest = cpu_to_le16(130);
@@ -1258,28 +1283,33 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
{
int rc = 0;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
- mutex_lock(&ses->server->srv_mutex);
- if (ses->server->ops->generate_signingkey) {
- rc = ses->server->ops->generate_signingkey(ses);
+ mutex_lock(&server->srv_mutex);
+ if (server->ops->generate_signingkey) {
+ rc = server->ops->generate_signingkey(ses);
if (rc) {
cifs_dbg(FYI,
"SMB3 session key generation failed\n");
- mutex_unlock(&ses->server->srv_mutex);
+ mutex_unlock(&server->srv_mutex);
return rc;
}
}
- if (!ses->server->session_estab) {
- ses->server->sequence_number = 0x2;
- ses->server->session_estab = true;
+ if (!server->session_estab) {
+ server->sequence_number = 0x2;
+ server->session_estab = true;
}
- mutex_unlock(&ses->server->srv_mutex);
+ mutex_unlock(&server->srv_mutex);
cifs_dbg(FYI, "SMB2/3 session established successfully\n");
- spin_lock(&GlobalMid_Lock);
- ses->status = CifsGood;
- ses->need_reconnect = false;
- spin_unlock(&GlobalMid_Lock);
+ /* keep existing ses state if binding */
+ if (!ses->binding) {
+ spin_lock(&GlobalMid_Lock);
+ ses->status = CifsGood;
+ ses->need_reconnect = false;
+ spin_unlock(&GlobalMid_Lock);
+ }
+
return rc;
}
@@ -1317,16 +1347,19 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
goto out_put_spnego_key;
}
- ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
- GFP_KERNEL);
- if (!ses->auth_key.response) {
- cifs_dbg(VFS,
- "Kerberos can't allocate (%u bytes) memory",
- msg->sesskey_len);
- rc = -ENOMEM;
- goto out_put_spnego_key;
+ /* keep session key if binding */
+ if (!ses->binding) {
+ ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+ GFP_KERNEL);
+ if (!ses->auth_key.response) {
+ cifs_dbg(VFS,
+ "Kerberos can't allocate (%u bytes) memory",
+ msg->sesskey_len);
+ rc = -ENOMEM;
+ goto out_put_spnego_key;
+ }
+ ses->auth_key.len = msg->sesskey_len;
}
- ses->auth_key.len = msg->sesskey_len;
sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
sess_data->iov[1].iov_len = msg->secblob_len;
@@ -1336,9 +1369,11 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
goto out_put_spnego_key;
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
- ses->Suid = rsp->sync_hdr.SessionId;
-
- ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ /* keep session id and flags if binding */
+ if (!ses->binding) {
+ ses->Suid = rsp->sync_hdr.SessionId;
+ ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ }
rc = SMB2_sess_establish_session(sess_data);
out_put_spnego_key:
@@ -1432,9 +1467,11 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
-
- ses->Suid = rsp->sync_hdr.SessionId;
- ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ /* keep existing ses id and flags if binding */
+ if (!ses->binding) {
+ ses->Suid = rsp->sync_hdr.SessionId;
+ ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ }
out:
kfree(ntlmssp_blob);
@@ -1491,8 +1528,11 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
- ses->Suid = rsp->sync_hdr.SessionId;
- ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ /* keep existing ses id and flags if binding */
+ if (!ses->binding) {
+ ses->Suid = rsp->sync_hdr.SessionId;
+ ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ }
rc = SMB2_sess_establish_session(sess_data);
out:
@@ -1509,7 +1549,7 @@ SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data)
{
int type;
- type = smb2_select_sectype(ses->server, ses->sectype);
+ type = smb2_select_sectype(cifs_ses_server(ses), ses->sectype);
cifs_dbg(FYI, "sess setup type %d\n", type);
if (type == Unspecified) {
cifs_dbg(VFS,
@@ -1537,7 +1577,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
int rc = 0;
- struct TCP_Server_Info *server = ses->server;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
struct SMB2_sess_data *sess_data;
cifs_dbg(FYI, "Session Setup\n");
@@ -1563,7 +1603,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
/*
* Initialize the session hash with the server one.
*/
- memcpy(ses->preauth_sha_hash, ses->server->preauth_sha_hash,
+ memcpy(ses->preauth_sha_hash, server->preauth_sha_hash,
SMB2_PREAUTH_HASH_SIZE);
while (sess_data->func)
@@ -1807,6 +1847,8 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
return 0;
+ close_shroot(&tcon->crfid);
+
rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req,
&total_len);
if (rc)
@@ -2661,7 +2703,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
int rc;
char *in_data_buf;
- rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
+ rc = smb2_ioctl_req_init(opcode, tcon, (void **) &req, &total_len);
if (rc)
return rc;
@@ -2972,7 +3014,21 @@ int
SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid)
{
- return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0);
+ int rc;
+ int tmp_rc;
+
+ rc = SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0);
+
+ /* retry close in a worker thread if this one is interrupted */
+ if (rc == -EINTR) {
+ tmp_rc = smb2_handle_cancelled_close(tcon, persistent_fid,
+ volatile_fid);
+ if (tmp_rc)
+ cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n",
+ persistent_fid, tmp_rc);
+ }
+
+ return rc;
}
int
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 0abfde6d0b05..f264e1d36fe1 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1386,7 +1386,7 @@ struct smb2_oplock_break {
struct smb2_lease_break {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 44 */
- __le16 Reserved;
+ __le16 Epoch;
__le32 Flags;
__u8 LeaseKey[16];
__le32 CurrentLeaseState;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 71b2930b8e0b..d21a5fcc8d06 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -46,7 +46,8 @@ extern int smb2_verify_signature(struct smb_rqst *, struct TCP_Server_Info *);
extern int smb2_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
- struct smb_rqst *rqst);
+ struct TCP_Server_Info *,
+ struct smb_rqst *rqst);
extern struct mid_q_entry *smb2_setup_async_request(
struct TCP_Server_Info *server, struct smb_rqst *rqst);
extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
@@ -212,6 +213,9 @@ extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
const u64 persistent_fid, const u64 volatile_fid,
const __u8 oplock_level);
+extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon,
+ __u64 persistent_fid,
+ __u64 volatile_fid);
extern int smb2_handle_cancelled_mid(char *buffer,
struct TCP_Server_Info *server);
void smb2_cancelled_close_fid(struct work_struct *work);
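
Note: cifs_ses_binding_channel(), used by generate_smb3signingkey() in the next file, is also defined outside this diff; a minimal sketch, assuming the channel being bound is the not-yet-counted slot at chan_count:

	static inline struct cifs_chan *
	cifs_ses_binding_channel(struct cifs_ses *ses)
	{
		return ses->binding ? &ses->chans[ses->chan_count] : NULL;
	}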
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 148d7942c796..387c88704c52 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -98,6 +98,61 @@ err:
return rc;
}
+
+static
+int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
+{
+ struct cifs_chan *chan;
+ struct cifs_ses *ses = NULL;
+ int i;
+ int rc = 0;
+
+ spin_lock(&cifs_tcp_ses_lock);
+
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (ses->Suid == ses_id)
+ goto found;
+ }
+ }
+ cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n",
+ __func__, ses_id);
+ rc = -ENOENT;
+ goto out;
+
+found:
+ if (ses->binding) {
+ /*
+ * If we are in the process of binding a new channel
+ * to an existing session, use the master connection's
+ * signing key.
+ */
+ memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE);
+ goto out;
+ }
+
+ /*
+ * Otherwise, use the channel key.
+ */
+
+ for (i = 0; i < ses->chan_count; i++) {
+ chan = ses->chans + i;
+ if (chan->server == server) {
+ memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE);
+ goto out;
+ }
+ }
+
+ cifs_dbg(VFS,
+ "%s: Could not find channel signing key for session 0x%llx\n",
+ __func__, ses_id);
+ rc = -ENOENT;
+
+out:
+ spin_unlock(&cifs_tcp_ses_lock);
+ return rc;
+}
+
static struct cifs_ses *
smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
{
@@ -328,21 +383,45 @@ generate_smb3signingkey(struct cifs_ses *ses,
{
int rc;
- rc = generate_key(ses, ptriplet->signing.label,
- ptriplet->signing.context, ses->smb3signingkey,
- SMB3_SIGN_KEY_SIZE);
- if (rc)
- return rc;
+ /*
+ * All channels use the same encryption/decryption keys but
+ * each channel has its own signing key.
+ *
+ * When we generate the keys, check if it is for a new channel
+ * (binding), in which case we only need to generate a signing
+ * key and store it in the channel, so as not to overwrite the
+ * master connection signing key stored in the session.
+ */
- rc = generate_key(ses, ptriplet->encryption.label,
- ptriplet->encryption.context, ses->smb3encryptionkey,
- SMB3_SIGN_KEY_SIZE);
- if (rc)
- return rc;
+ if (ses->binding) {
+ rc = generate_key(ses, ptriplet->signing.label,
+ ptriplet->signing.context,
+ cifs_ses_binding_channel(ses)->signkey,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
+ } else {
+ rc = generate_key(ses, ptriplet->signing.label,
+ ptriplet->signing.context,
+ ses->smb3signingkey,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
- rc = generate_key(ses, ptriplet->decryption.label,
- ptriplet->decryption.context,
- ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+ memcpy(ses->chans[0].signkey, ses->smb3signingkey,
+ SMB3_SIGN_KEY_SIZE);
+
+ rc = generate_key(ses, ptriplet->encryption.label,
+ ptriplet->encryption.context,
+ ses->smb3encryptionkey,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
+
+ rc = generate_key(ses, ptriplet->decryption.label,
+ ptriplet->decryption.context,
+ ses->smb3decryptionkey,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
+ }
if (rc)
return rc;
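
A compact sketch of the key-selection rule the two hunks above implement: sign with the master key while a channel is being bound, otherwise with the matching channel's own key. The structures and names here (struct session, pick_sign_key) are illustrative stand-ins, not the cifs ones:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SIGN_KEY_SIZE 16
#define MAX_CHANS 4

struct chan { int conn_id; uint8_t signkey[SIGN_KEY_SIZE]; };

struct session {
	bool binding;                       /* a new channel is being bound */
	uint8_t master_key[SIGN_KEY_SIZE];  /* master connection signing key */
	struct chan chans[MAX_CHANS];
	int chan_count;
};

/* While binding, sign with the master connection's key; otherwise
 * use the signing key of the channel matching this connection. */
static int pick_sign_key(const struct session *ses, int conn_id, uint8_t *key)
{
	if (ses->binding) {
		memcpy(key, ses->master_key, SIGN_KEY_SIZE);
		return 0;
	}
	for (int i = 0; i < ses->chan_count; i++) {
		if (ses->chans[i].conn_id == conn_id) {
			memcpy(key, ses->chans[i].signkey, SIGN_KEY_SIZE);
			return 0;
		}
	}
	return -1; /* no signing key for this connection */
}

int main(void)
{
	struct session ses = { .binding = false, .chan_count = 1 };
	uint8_t key[SIGN_KEY_SIZE];
	int rc;

	ses.chans[0].conn_id = 42;
	memset(ses.chans[0].signkey, 0xAB, SIGN_KEY_SIZE);
	rc = pick_sign_key(&ses, 42, key);
	printf("rc=%d key[0]=0x%02x\n", rc, key[0]);
	return 0;
}
```
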
@@ -431,21 +510,19 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
- struct cifs_ses *ses;
struct shash_desc *shash = &server->secmech.sdesccmacaes->shash;
struct smb_rqst drqst;
+ u8 key[SMB3_SIGN_KEY_SIZE];
- ses = smb2_find_smb_ses(server, shdr->SessionId);
- if (!ses) {
- cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
+ rc = smb2_get_sign_key(shdr->SessionId, server, key);
+ if (rc)
return 0;
- }
memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
rc = crypto_shash_setkey(server->secmech.cmacaes,
- ses->smb3signingkey, SMB2_CMACAES_SIZE);
+ key, SMB2_CMACAES_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
return rc;
@@ -494,16 +571,25 @@ static int
smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
int rc = 0;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_sync_hdr *shdr;
+ struct smb2_sess_setup_req *ssr;
+ bool is_binding;
+ bool is_signed;
- if (!(shdr->Flags & SMB2_FLAGS_SIGNED) ||
- server->tcpStatus == CifsNeedNegotiate)
- return rc;
+ shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ ssr = (struct smb2_sess_setup_req *)shdr;
+
+ is_binding = shdr->Command == SMB2_SESSION_SETUP &&
+ (ssr->Flags & SMB2_SESSION_REQ_FLAG_BINDING);
+ is_signed = shdr->Flags & SMB2_FLAGS_SIGNED;
- if (!server->session_estab) {
+ if (!is_signed)
+ return 0;
+ if (server->tcpStatus == CifsNeedNegotiate)
+ return 0;
+ if (!is_binding && !server->session_estab) {
strncpy(shdr->Signature, "BSRSPYL", 8);
- return rc;
+ return 0;
}
rc = server->ops->calc_signature(rqst, server);
@@ -610,18 +696,18 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
}
static int
-smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
- struct mid_q_entry **mid)
+smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
+ struct smb2_sync_hdr *shdr, struct mid_q_entry **mid)
{
- if (ses->server->tcpStatus == CifsExiting)
+ if (server->tcpStatus == CifsExiting)
return -ENOENT;
- if (ses->server->tcpStatus == CifsNeedReconnect) {
+ if (server->tcpStatus == CifsNeedReconnect) {
cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
return -EAGAIN;
}
- if (ses->server->tcpStatus == CifsNeedNegotiate &&
+ if (server->tcpStatus == CifsNeedNegotiate &&
shdr->Command != SMB2_NEGOTIATE)
return -EAGAIN;
@@ -638,11 +724,11 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
/* else ok - we are shutting down the session */
}
- *mid = smb2_mid_entry_alloc(shdr, ses->server);
+ *mid = smb2_mid_entry_alloc(shdr, server);
if (*mid == NULL)
return -ENOMEM;
spin_lock(&GlobalMid_Lock);
- list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q);
+ list_add_tail(&(*mid)->qhead, &server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
return 0;
@@ -675,24 +761,25 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
}
struct mid_q_entry *
-smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
+smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server,
+ struct smb_rqst *rqst)
{
int rc;
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
- smb2_seq_num_into_buf(ses->server, shdr);
+ smb2_seq_num_into_buf(server, shdr);
- rc = smb2_get_mid_entry(ses, shdr, &mid);
+ rc = smb2_get_mid_entry(ses, server, shdr, &mid);
if (rc) {
- revert_current_mid_from_hdr(ses->server, shdr);
+ revert_current_mid_from_hdr(server, shdr);
return ERR_PTR(rc);
}
- rc = smb2_sign_rqst(rqst, ses->server);
+ rc = smb2_sign_rqst(rqst, server);
if (rc) {
- revert_current_mid_from_hdr(ses->server, shdr);
+ revert_current_mid_from_hdr(server, shdr);
cifs_delete_mid(mid);
return ERR_PTR(rc);
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 3c91fa97c9a8..5b1b97e9e0c9 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1069,7 +1069,7 @@ static int smbd_post_send_data(
if (n_vec > SMBDIRECT_MAX_SGE) {
cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
- return -ENOMEM;
+ return -EINVAL;
}
sg_init_table(sgl, n_vec);
@@ -1476,6 +1476,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
info->transport_status = SMBD_DESTROYED;
destroy_workqueue(info->workqueue);
+ log_rdma_event(INFO, "rdma session destroyed\n");
kfree(info);
}
@@ -1505,8 +1506,9 @@ create_conn:
log_rdma_event(INFO, "creating rdma session\n");
server->smbd_conn = smbd_get_connection(
server, (struct sockaddr *) &server->dstaddr);
- log_rdma_event(INFO, "created rdma session info=%p\n",
- server->smbd_conn);
+
+ if (server->smbd_conn)
+ cifs_dbg(VFS, "RDMA transport re-established\n");
return server->smbd_conn ? 0 : -ENOENT;
}
@@ -1970,7 +1972,7 @@ read_rfc1002_done:
if (info->transport_status != SMBD_CONNECTED) {
log_read(ERR, "disconnected\n");
- return 0;
+ return -ECONNABORTED;
}
goto again;
@@ -2269,12 +2271,7 @@ static void smbd_mr_recovery_work(struct work_struct *work)
int rc;
list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
- if (smbdirect_mr->state == MR_INVALIDATED)
- ib_dma_unmap_sg(
- info->id->device, smbdirect_mr->sgl,
- smbdirect_mr->sgl_count,
- smbdirect_mr->dir);
- else if (smbdirect_mr->state == MR_ERROR) {
+ if (smbdirect_mr->state == MR_ERROR) {
/* recover this MR entry */
rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2602,11 +2599,20 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
*/
smbdirect_mr->state = MR_INVALIDATED;
- /*
- * Schedule the work to do MR recovery for future I/Os
- * MR recovery is slow and we don't want it to block the current I/O
- */
- queue_work(info->workqueue, &info->mr_recovery_work);
+ if (smbdirect_mr->state == MR_INVALIDATED) {
+ ib_dma_unmap_sg(
+ info->id->device, smbdirect_mr->sgl,
+ smbdirect_mr->sgl_count,
+ smbdirect_mr->dir);
+ smbdirect_mr->state = MR_READY;
+ if (atomic_inc_return(&info->mr_ready_count) == 1)
+ wake_up_interruptible(&info->wait_mr);
+ } else
+ /*
+ * Schedule the work to do MR recovery for future I/Os.
+ * MR recovery is slow and we don't want it to block the
+ * current I/O.
+ */
+ queue_work(info->workqueue, &info->mr_recovery_work);
done:
if (atomic_dec_and_test(&info->mr_used_count))
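
The reworked smbd_deregister_mr() recycles a successfully invalidated MR inline (unmap, mark it MR_READY, bump mr_ready_count, wake a waiter when the pool transitions from empty) and only falls back to the slow recovery worker on failure. A rough C11 sketch of that fast path, with a made-up name (mr_recycle) rather than the smbdirect ones:

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int mr_ready_count;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_mr = PTHREAD_COND_INITIALIZER;

/* Fast path: an invalidated MR is unmapped and recycled inline;
 * only a failed invalidation is handed to the slow recovery worker. */
static void mr_recycle(int invalidated)
{
	if (invalidated) {
		/* ib_dma_unmap_sg() + state = MR_READY would go here */
		if (atomic_fetch_add(&mr_ready_count, 1) + 1 == 1) {
			/* pool was empty: wake one waiter, like wait_mr */
			pthread_mutex_lock(&lock);
			pthread_cond_signal(&wait_mr);
			pthread_mutex_unlock(&lock);
		}
	} else {
		printf("queueing slow MR recovery work\n");
	}
}

int main(void)
{
	mr_recycle(1);
	mr_recycle(0);
	printf("ready MRs: %d\n", atomic_load(&mr_ready_count));
	return 0;
}
```

atomic_fetch_add() returns the old value, so `old + 1 == 1` mirrors the kernel's `atomic_inc_return(...) == 1` test for the empty-to-nonempty transition.
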
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ca3de62688d6..3d2e11f85cba 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -93,8 +93,14 @@ static void _cifs_mid_q_entry_release(struct kref *refcount)
__u16 smb_cmd = le16_to_cpu(midEntry->command);
unsigned long now;
unsigned long roundtrip_time;
- struct TCP_Server_Info *server = midEntry->server;
#endif
+ struct TCP_Server_Info *server = midEntry->server;
+
+ if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) &&
+ midEntry->mid_state == MID_RESPONSE_RECEIVED &&
+ server->ops->handle_cancelled_mid)
+ server->ops->handle_cancelled_mid(midEntry->resp_buf, server);
+
midEntry->mid_state = MID_FREE;
atomic_dec(&midCount);
if (midEntry->large_buf)
@@ -319,8 +325,11 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
int val = 1;
__be32 rfc1002_marker;
- if (cifs_rdma_enabled(server) && server->smbd_conn) {
- rc = smbd_send(server, num_rqst, rqst);
+ if (cifs_rdma_enabled(server)) {
+ /* return -EAGAIN when connecting or reconnecting */
+ rc = -EAGAIN;
+ if (server->smbd_conn)
+ rc = smbd_send(server, num_rqst, rqst);
goto smbd_done;
}
@@ -927,7 +936,8 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
}
struct mid_q_entry *
-cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
+cifs_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *ignored,
+ struct smb_rqst *rqst)
{
int rc;
struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
@@ -999,7 +1009,18 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- server = ses->server;
+ if (!ses->binding) {
+ uint index = 0;
+
+ if (ses->chan_count > 1) {
+ index = (uint)atomic_inc_return(&ses->chan_seq);
+ index %= ses->chan_count;
+ }
+ server = ses->chans[index].server;
+ } else {
+ server = cifs_ses_server(ses);
+ }
+
if (server->tcpStatus == CifsExiting)
return -ENOENT;
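
compound_send_recv() now picks the transmit channel instead of always using ses->server: a binding session stays on the connection being bound, everything else round-robins over ses->chans via an atomic sequence counter. A standalone sketch of that selection, assuming for simplicity that channel 0 is the master connection:

```c
#include <stdatomic.h>
#include <stdio.h>

struct server { int id; };

struct session {
	int binding;
	unsigned int chan_count;
	struct server *chans[8];
	atomic_uint chan_seq;   /* monotonically increasing pick counter */
};

/* Round-robin across channels; a binding session must stay on the
 * master connection (assumed to be channel 0 in this sketch). */
static struct server *pick_channel(struct session *ses)
{
	unsigned int index = 0;

	if (!ses->binding && ses->chan_count > 1)
		index = atomic_fetch_add(&ses->chan_seq, 1) % ses->chan_count;
	return ses->chans[index];
}

int main(void)
{
	struct server a = { .id = 0 }, b = { .id = 1 };
	struct session ses = { .binding = 0, .chan_count = 2,
			       .chans = { &a, &b } };

	for (int i = 0; i < 4; i++)
		printf("request %d -> channel %d\n", i, pick_channel(&ses)->id);
	return 0;
}
```
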
@@ -1044,7 +1065,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
}
for (i = 0; i < num_rqst; i++) {
- midQ[i] = server->ops->setup_request(ses, &rqst[i]);
+ midQ[i] = server->ops->setup_request(ses, server, &rqst[i]);
if (IS_ERR(midQ[i])) {
revert_current_mid(server, i);
for (j = 0; j < i; j++)
@@ -1119,8 +1140,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(server, &rqst[i], midQ[i]);
spin_lock(&GlobalMid_Lock);
+ midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
- midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
@@ -1287,7 +1308,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = allocate_mid(ses, in_buf, &midQ);
if (rc) {
- mutex_unlock(&ses->server->srv_mutex);
+ mutex_unlock(&server->srv_mutex);
/* Update # of requests on wire to server */
add_credits(server, &credits, 0);
return rc;
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index b7f9ffa1d5f1..aaad4ca1217e 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -48,8 +48,8 @@
#define elf_prstatus compat_elf_prstatus
#define elf_prpsinfo compat_elf_prpsinfo
-#undef ns_to_timeval
-#define ns_to_timeval ns_to_old_timeval32
+#undef ns_to_kernel_old_timeval
+#define ns_to_kernel_old_timeval ns_to_old_timeval32
/*
* To use this file, asm/elf.h must define compat_elf_check_arch.
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a7ec2d3dff92..9ae90d728c0f 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -11,8 +11,6 @@
* ioctls.
*/
-#include <linux/joystick.h>
-
#include <linux/types.h>
#include <linux/compat.h>
#include <linux/kernel.h>
@@ -27,13 +25,9 @@
#include <linux/file.h>
#include <linux/ppp-ioctl.h>
#include <linux/if_pppox.h>
-#include <linux/mtio.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>
-#include <linux/raw.h>
#include <linux/blkdev.h>
-#include <linux/rtc.h>
-#include <linux/pci.h>
#include <linux/serial.h>
#include <linux/ctype.h>
#include <linux/syscalls.h>
@@ -42,13 +36,6 @@
#include "internal.h"
-#include <net/bluetooth/bluetooth.h>
-#include <net/bluetooth/hci_sock.h>
-#include <net/bluetooth/rfcomm.h>
-
-#include <linux/capi.h>
-#include <linux/gigaset_dev.h>
-
#ifdef CONFIG_BLOCK
#include <linux/cdrom.h>
#include <linux/fd.h>
@@ -60,448 +47,11 @@
#include <linux/uaccess.h>
#include <linux/watchdog.h>
-#include <linux/soundcard.h>
-
#include <linux/hiddev.h>
#include <linux/sort.h>
-#ifdef CONFIG_SPARC
-#include <linux/fb.h>
-#include <asm/fbio.h>
-#endif
-
-#define convert_in_user(srcptr, dstptr) \
-({ \
- typeof(*srcptr) val; \
- \
- get_user(val, srcptr) || put_user(val, dstptr); \
-})
-
-static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- int err;
-
- err = security_file_ioctl(file, cmd, arg);
- if (err)
- return err;
-
- return vfs_ioctl(file, cmd, arg);
-}
-
-#ifdef CONFIG_BLOCK
-typedef struct sg_io_hdr32 {
- compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
- compat_int_t dxfer_direction; /* [i] data transfer direction */
- unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */
- unsigned char mx_sb_len; /* [i] max length to write to sbp */
- unsigned short iovec_count; /* [i] 0 implies no scatter gather */
- compat_uint_t dxfer_len; /* [i] byte count of data transfer */
- compat_uint_t dxferp; /* [i], [*io] points to data transfer memory
- or scatter gather list */
- compat_uptr_t cmdp; /* [i], [*i] points to command to perform */
- compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */
- compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */
- compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */
- compat_int_t pack_id; /* [i->o] unused internally (normally) */
- compat_uptr_t usr_ptr; /* [i->o] unused internally */
- unsigned char status; /* [o] scsi status */
- unsigned char masked_status; /* [o] shifted, masked scsi status */
- unsigned char msg_status; /* [o] messaging level data (optional) */
- unsigned char sb_len_wr; /* [o] byte count actually written to sbp */
- unsigned short host_status; /* [o] errors from host adapter */
- unsigned short driver_status; /* [o] errors from software driver */
- compat_int_t resid; /* [o] dxfer_len - actual_transferred */
- compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */
- compat_uint_t info; /* [o] auxiliary information */
-} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */
-
-typedef struct sg_iovec32 {
- compat_uint_t iov_base;
- compat_uint_t iov_len;
-} sg_iovec32_t;
-
-static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count)
-{
- sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1);
- sg_iovec32_t __user *iov32 = dxferp;
- int i;
-
- for (i = 0; i < iovec_count; i++) {
- u32 base, len;
-
- if (get_user(base, &iov32[i].iov_base) ||
- get_user(len, &iov32[i].iov_len) ||
- put_user(compat_ptr(base), &iov[i].iov_base) ||
- put_user(len, &iov[i].iov_len))
- return -EFAULT;
- }
-
- if (put_user(iov, &sgio->dxferp))
- return -EFAULT;
- return 0;
-}
-
-static int sg_ioctl_trans(struct file *file, unsigned int cmd,
- sg_io_hdr32_t __user *sgio32)
-{
- sg_io_hdr_t __user *sgio;
- u16 iovec_count;
- u32 data;
- void __user *dxferp;
- int err;
- int interface_id;
-
- if (get_user(interface_id, &sgio32->interface_id))
- return -EFAULT;
- if (interface_id != 'S')
- return do_ioctl(file, cmd, (unsigned long)sgio32);
-
- if (get_user(iovec_count, &sgio32->iovec_count))
- return -EFAULT;
-
- {
- void __user *top = compat_alloc_user_space(0);
- void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) +
- (iovec_count * sizeof(sg_iovec_t)));
- if (new > top)
- return -EINVAL;
-
- sgio = new;
- }
-
- /* Ok, now construct. */
- if (copy_in_user(&sgio->interface_id, &sgio32->interface_id,
- (2 * sizeof(int)) +
- (2 * sizeof(unsigned char)) +
- (1 * sizeof(unsigned short)) +
- (1 * sizeof(unsigned int))))
- return -EFAULT;
-
- if (get_user(data, &sgio32->dxferp))
- return -EFAULT;
- dxferp = compat_ptr(data);
- if (iovec_count) {
- if (sg_build_iovec(sgio, dxferp, iovec_count))
- return -EFAULT;
- } else {
- if (put_user(dxferp, &sgio->dxferp))
- return -EFAULT;
- }
-
- {
- unsigned char __user *cmdp;
- unsigned char __user *sbp;
-
- if (get_user(data, &sgio32->cmdp))
- return -EFAULT;
- cmdp = compat_ptr(data);
-
- if (get_user(data, &sgio32->sbp))
- return -EFAULT;
- sbp = compat_ptr(data);
-
- if (put_user(cmdp, &sgio->cmdp) ||
- put_user(sbp, &sgio->sbp))
- return -EFAULT;
- }
-
- if (copy_in_user(&sgio->timeout, &sgio32->timeout,
- 3 * sizeof(int)))
- return -EFAULT;
-
- if (get_user(data, &sgio32->usr_ptr))
- return -EFAULT;
- if (put_user(compat_ptr(data), &sgio->usr_ptr))
- return -EFAULT;
-
- err = do_ioctl(file, cmd, (unsigned long) sgio);
-
- if (err >= 0) {
- void __user *datap;
-
- if (copy_in_user(&sgio32->pack_id, &sgio->pack_id,
- sizeof(int)) ||
- get_user(datap, &sgio->usr_ptr) ||
- put_user((u32)(unsigned long)datap,
- &sgio32->usr_ptr) ||
- copy_in_user(&sgio32->status, &sgio->status,
- (4 * sizeof(unsigned char)) +
- (2 * sizeof(unsigned short)) +
- (3 * sizeof(int))))
- err = -EFAULT;
- }
-
- return err;
-}
-
-struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
- char req_state;
- char orphan;
- char sg_io_owned;
- char problem;
- int pack_id;
- compat_uptr_t usr_ptr;
- unsigned int duration;
- int unused;
-};
-
-static int sg_grt_trans(struct file *file,
- unsigned int cmd, struct compat_sg_req_info __user *o)
-{
- int err, i;
- sg_req_info_t __user *r;
- r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
- err = do_ioctl(file, cmd, (unsigned long)r);
- if (err < 0)
- return err;
- for (i = 0; i < SG_MAX_QUEUE; i++) {
- void __user *ptr;
- int d;
-
- if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) ||
- get_user(ptr, &r[i].usr_ptr) ||
- get_user(d, &r[i].duration) ||
- put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) ||
- put_user(d, &o[i].duration))
- return -EFAULT;
- }
- return err;
-}
-#endif /* CONFIG_BLOCK */
-
-struct sock_fprog32 {
- unsigned short len;
- compat_caddr_t filter;
-};
-
-#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32)
-#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32)
-
-static int ppp_sock_fprog_ioctl_trans(struct file *file,
- unsigned int cmd, struct sock_fprog32 __user *u_fprog32)
-{
- struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
- void __user *fptr64;
- u32 fptr32;
- u16 flen;
-
- if (get_user(flen, &u_fprog32->len) ||
- get_user(fptr32, &u_fprog32->filter))
- return -EFAULT;
-
- fptr64 = compat_ptr(fptr32);
-
- if (put_user(flen, &u_fprog64->len) ||
- put_user(fptr64, &u_fprog64->filter))
- return -EFAULT;
-
- if (cmd == PPPIOCSPASS32)
- cmd = PPPIOCSPASS;
- else
- cmd = PPPIOCSACTIVE;
-
- return do_ioctl(file, cmd, (unsigned long) u_fprog64);
-}
-
-struct ppp_option_data32 {
- compat_caddr_t ptr;
- u32 length;
- compat_int_t transmit;
-};
-#define PPPIOCSCOMPRESS32 _IOW('t', 77, struct ppp_option_data32)
-
-struct ppp_idle32 {
- compat_time_t xmit_idle;
- compat_time_t recv_idle;
-};
-#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32)
-
-static int ppp_gidle(struct file *file, unsigned int cmd,
- struct ppp_idle32 __user *idle32)
-{
- struct ppp_idle __user *idle;
- __kernel_time_t xmit, recv;
- int err;
-
- idle = compat_alloc_user_space(sizeof(*idle));
-
- err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle);
-
- if (!err) {
- if (get_user(xmit, &idle->xmit_idle) ||
- get_user(recv, &idle->recv_idle) ||
- put_user(xmit, &idle32->xmit_idle) ||
- put_user(recv, &idle32->recv_idle))
- err = -EFAULT;
- }
- return err;
-}
-
-static int ppp_scompress(struct file *file, unsigned int cmd,
- struct ppp_option_data32 __user *odata32)
-{
- struct ppp_option_data __user *odata;
- __u32 data;
- void __user *datap;
-
- odata = compat_alloc_user_space(sizeof(*odata));
-
- if (get_user(data, &odata32->ptr))
- return -EFAULT;
-
- datap = compat_ptr(data);
- if (put_user(datap, &odata->ptr))
- return -EFAULT;
-
- if (copy_in_user(&odata->length, &odata32->length,
- sizeof(__u32) + sizeof(int)))
- return -EFAULT;
-
- return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata);
-}
-
-#ifdef CONFIG_BLOCK
-struct mtget32 {
- compat_long_t mt_type;
- compat_long_t mt_resid;
- compat_long_t mt_dsreg;
- compat_long_t mt_gstat;
- compat_long_t mt_erreg;
- compat_daddr_t mt_fileno;
- compat_daddr_t mt_blkno;
-};
-#define MTIOCGET32 _IOR('m', 2, struct mtget32)
-
-struct mtpos32 {
- compat_long_t mt_blkno;
-};
-#define MTIOCPOS32 _IOR('m', 3, struct mtpos32)
-
-static int mt_ioctl_trans(struct file *file,
- unsigned int cmd, void __user *argp)
-{
- /* NULL initialization to make gcc shut up */
- struct mtget __user *get = NULL;
- struct mtget32 __user *umget32;
- struct mtpos __user *pos = NULL;
- struct mtpos32 __user *upos32;
- unsigned long kcmd;
- void *karg;
- int err = 0;
-
- switch(cmd) {
- case MTIOCPOS32:
- kcmd = MTIOCPOS;
- pos = compat_alloc_user_space(sizeof(*pos));
- karg = pos;
- break;
- default: /* MTIOCGET32 */
- kcmd = MTIOCGET;
- get = compat_alloc_user_space(sizeof(*get));
- karg = get;
- break;
- }
- if (karg == NULL)
- return -EFAULT;
- err = do_ioctl(file, kcmd, (unsigned long)karg);
- if (err)
- return err;
- switch (cmd) {
- case MTIOCPOS32:
- upos32 = argp;
- err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno);
- break;
- case MTIOCGET32:
- umget32 = argp;
- err = convert_in_user(&get->mt_type, &umget32->mt_type);
- err |= convert_in_user(&get->mt_resid, &umget32->mt_resid);
- err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg);
- err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat);
- err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg);
- err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno);
- err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno);
- break;
- }
- return err ? -EFAULT: 0;
-}
-
-#endif /* CONFIG_BLOCK */
-
-/* Bluetooth ioctls */
-#define HCIUARTSETPROTO _IOW('U', 200, int)
-#define HCIUARTGETPROTO _IOR('U', 201, int)
-#define HCIUARTGETDEVICE _IOR('U', 202, int)
-#define HCIUARTSETFLAGS _IOW('U', 203, int)
-#define HCIUARTGETFLAGS _IOR('U', 204, int)
-
-#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
-#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t)
-#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
-#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t)
-
-static int rtc_ioctl(struct file *file,
- unsigned cmd, void __user *argp)
-{
- unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
- int ret;
-
- if (valp == NULL)
- return -EFAULT;
- switch (cmd) {
- case RTC_IRQP_READ32:
- case RTC_EPOCH_READ32:
- ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ?
- RTC_IRQP_READ : RTC_EPOCH_READ,
- (unsigned long)valp);
- if (ret)
- return ret;
- return convert_in_user(valp, (unsigned int __user *)argp);
- case RTC_IRQP_SET32:
- return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp);
- case RTC_EPOCH_SET32:
- return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp);
- }
-
- return -ENOIOCTLCMD;
-}
-
-/* on ia32 l_start is on a 32-bit boundary */
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
-struct space_resv_32 {
- __s16 l_type;
- __s16 l_whence;
- __s64 l_start __attribute__((packed));
- /* len == 0 means until end of file */
- __s64 l_len __attribute__((packed));
- __s32 l_sysid;
- __u32 l_pid;
- __s32 l_pad[4]; /* reserve area */
-};
-
-#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32)
-#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32)
-
-/* just account for different alignment */
-static int compat_ioctl_preallocate(struct file *file,
- struct space_resv_32 __user *p32)
-{
- struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
-
- if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
- copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) ||
- copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) ||
- copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) ||
- copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) ||
- copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) ||
- copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
- return -EFAULT;
-
- return ioctl_preallocate(file, p);
-}
-#endif
-
/*
* simple reversible transform to make our table more evenly
* distributed after sorting.
@@ -509,33 +59,7 @@ static int compat_ioctl_preallocate(struct file *file,
#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd),
-/* ioctl should not be warned about even if it's not implemented.
- Valid reasons to use this:
- - It is implemented with ->compat_ioctl on some device, but programs
- call it on others too.
- - The ioctl is not implemented in the native kernel, but programs
- call it commonly anyways.
- Most other reasons are not valid. */
-#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
-
static unsigned int ioctl_pointer[] = {
-/* compatible ioctls first */
-/* Little t */
-COMPATIBLE_IOCTL(TIOCOUTQ)
-/* Little f */
-COMPATIBLE_IOCTL(FIOCLEX)
-COMPATIBLE_IOCTL(FIONCLEX)
-COMPATIBLE_IOCTL(FIOASYNC)
-COMPATIBLE_IOCTL(FIONBIO)
-COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
-COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
-/* 0x00 */
-COMPATIBLE_IOCTL(FIBMAP)
-COMPATIBLE_IOCTL(FIGETBSZ)
-/* 'X' - originally XFS but some now in the VFS */
-COMPATIBLE_IOCTL(FIFREEZE)
-COMPATIBLE_IOCTL(FITHAW)
-COMPATIBLE_IOCTL(FITRIM)
#ifdef CONFIG_BLOCK
/* Big S */
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
@@ -547,43 +71,10 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
#endif
-/* Big V (don't complain on serial console) */
-IGNORE_IOCTL(VT_OPENQRY)
-IGNORE_IOCTL(VT_GETMODE)
-/* Little p (/dev/rtc, /dev/envctrl, etc.) */
-COMPATIBLE_IOCTL(RTC_AIE_ON)
-COMPATIBLE_IOCTL(RTC_AIE_OFF)
-COMPATIBLE_IOCTL(RTC_UIE_ON)
-COMPATIBLE_IOCTL(RTC_UIE_OFF)
-COMPATIBLE_IOCTL(RTC_PIE_ON)
-COMPATIBLE_IOCTL(RTC_PIE_OFF)
-COMPATIBLE_IOCTL(RTC_WIE_ON)
-COMPATIBLE_IOCTL(RTC_WIE_OFF)
-COMPATIBLE_IOCTL(RTC_ALM_SET)
-COMPATIBLE_IOCTL(RTC_ALM_READ)
-COMPATIBLE_IOCTL(RTC_RD_TIME)
-COMPATIBLE_IOCTL(RTC_SET_TIME)
-COMPATIBLE_IOCTL(RTC_WKALM_SET)
-COMPATIBLE_IOCTL(RTC_WKALM_RD)
-/*
- * These two are only for the sbus rtc driver, but
- * hwclock tries them on every rtc device first when
- * running on sparc. On other architectures the entries
- * are useless but harmless.
- */
-COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */
-COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
-/* Little m */
-COMPATIBLE_IOCTL(MTIOCTOP)
-/* Socket level stuff */
-COMPATIBLE_IOCTL(FIOQSIZE)
#ifdef CONFIG_BLOCK
-/* md calls this on random blockdevs */
-IGNORE_IOCTL(RAID_VERSION)
-/* qemu/qemu-img might call these two on plain files for probing */
-IGNORE_IOCTL(CDROM_DRIVE_STATUS)
-IGNORE_IOCTL(FDGETPRM32)
/* SG stuff */
+COMPATIBLE_IOCTL(SG_IO)
+COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
COMPATIBLE_IOCTL(SG_EMULATED_HOST)
@@ -607,314 +98,6 @@ COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
#endif
-/* PPP stuff */
-COMPATIBLE_IOCTL(PPPIOCGFLAGS)
-COMPATIBLE_IOCTL(PPPIOCSFLAGS)
-COMPATIBLE_IOCTL(PPPIOCGASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCSASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCGUNIT)
-COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCGMRU)
-COMPATIBLE_IOCTL(PPPIOCSMRU)
-COMPATIBLE_IOCTL(PPPIOCSMAXCID)
-COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCXFERUNIT)
-/* PPPIOCSCOMPRESS is translated */
-COMPATIBLE_IOCTL(PPPIOCGNPMODE)
-COMPATIBLE_IOCTL(PPPIOCSNPMODE)
-COMPATIBLE_IOCTL(PPPIOCGDEBUG)
-COMPATIBLE_IOCTL(PPPIOCSDEBUG)
-/* PPPIOCSPASS is translated */
-/* PPPIOCSACTIVE is translated */
-/* PPPIOCGIDLE is translated */
-COMPATIBLE_IOCTL(PPPIOCNEWUNIT)
-COMPATIBLE_IOCTL(PPPIOCATTACH)
-COMPATIBLE_IOCTL(PPPIOCDETACH)
-COMPATIBLE_IOCTL(PPPIOCSMRRU)
-COMPATIBLE_IOCTL(PPPIOCCONNECT)
-COMPATIBLE_IOCTL(PPPIOCDISCONN)
-COMPATIBLE_IOCTL(PPPIOCATTCHAN)
-COMPATIBLE_IOCTL(PPPIOCGCHAN)
-COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
-/* Big A */
-/* sparc only */
-/* Big Q for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE)
-COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS)
-COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL)
-COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE)
-/* Big T for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE)
-COMPATIBLE_IOCTL(SNDCTL_TMR_START)
-COMPATIBLE_IOCTL(SNDCTL_TMR_STOP)
-COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE)
-COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO)
-COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE)
-COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME)
-COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT)
-/* Little m for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME)
-COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE)
-COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD)
-/* Big P for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_DSP_RESET)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED)
-COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE)
-COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS)
-COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER)
-COMPATIBLE_IOCTL(SNDCTL_DSP_POST)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE)
-COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR)
-/* SNDCTL_DSP_MAPINBUF, XXX needs translation */
-/* SNDCTL_DSP_MAPOUTBUF, XXX needs translation */
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY)
-COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE)
-COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE)
-COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS)
-COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS)
-COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER)
-/* Big C for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_COPR_RESET)
-COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD)
-COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA)
-COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE)
-COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA)
-COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE)
-COMPATIBLE_IOCTL(SNDCTL_COPR_RUN)
-COMPATIBLE_IOCTL(SNDCTL_COPR_HALT)
-COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG)
-COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG)
-/* Big M for sound/OSS */
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3)
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR))
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE)
-/* SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */
-/* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3)
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR))
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE)
-/* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */
-/* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC)
-COMPATIBLE_IOCTL(SOUND_MIXER_INFO)
-COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO)
-COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS)
-COMPATIBLE_IOCTL(SOUND_MIXER_AGC)
-COMPATIBLE_IOCTL(SOUND_MIXER_3DSE)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
-COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
-COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
-COMPATIBLE_IOCTL(OSS_GETVERSION)
-/* Raw devices */
-COMPATIBLE_IOCTL(RAW_SETBIND)
-COMPATIBLE_IOCTL(RAW_GETBIND)
-/* Watchdog */
-COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
-COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
-COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS)
-COMPATIBLE_IOCTL(WDIOC_GETTEMP)
-COMPATIBLE_IOCTL(WDIOC_SETOPTIONS)
-COMPATIBLE_IOCTL(WDIOC_KEEPALIVE)
-COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT)
-COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT)
-COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT)
-COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT)
-/* Big R */
-COMPATIBLE_IOCTL(RNDGETENTCNT)
-COMPATIBLE_IOCTL(RNDADDTOENTCNT)
-COMPATIBLE_IOCTL(RNDGETPOOL)
-COMPATIBLE_IOCTL(RNDADDENTROPY)
-COMPATIBLE_IOCTL(RNDZAPENTCNT)
-COMPATIBLE_IOCTL(RNDCLEARPOOL)
-/* Bluetooth */
-COMPATIBLE_IOCTL(HCIDEVUP)
-COMPATIBLE_IOCTL(HCIDEVDOWN)
-COMPATIBLE_IOCTL(HCIDEVRESET)
-COMPATIBLE_IOCTL(HCIDEVRESTAT)
-COMPATIBLE_IOCTL(HCIGETDEVLIST)
-COMPATIBLE_IOCTL(HCIGETDEVINFO)
-COMPATIBLE_IOCTL(HCIGETCONNLIST)
-COMPATIBLE_IOCTL(HCIGETCONNINFO)
-COMPATIBLE_IOCTL(HCIGETAUTHINFO)
-COMPATIBLE_IOCTL(HCISETRAW)
-COMPATIBLE_IOCTL(HCISETSCAN)
-COMPATIBLE_IOCTL(HCISETAUTH)
-COMPATIBLE_IOCTL(HCISETENCRYPT)
-COMPATIBLE_IOCTL(HCISETPTYPE)
-COMPATIBLE_IOCTL(HCISETLINKPOL)
-COMPATIBLE_IOCTL(HCISETLINKMODE)
-COMPATIBLE_IOCTL(HCISETACLMTU)
-COMPATIBLE_IOCTL(HCISETSCOMTU)
-COMPATIBLE_IOCTL(HCIBLOCKADDR)
-COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
-COMPATIBLE_IOCTL(HCIINQUIRY)
-COMPATIBLE_IOCTL(HCIUARTSETPROTO)
-COMPATIBLE_IOCTL(HCIUARTGETPROTO)
-COMPATIBLE_IOCTL(HCIUARTGETDEVICE)
-COMPATIBLE_IOCTL(HCIUARTSETFLAGS)
-COMPATIBLE_IOCTL(HCIUARTGETFLAGS)
-COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
-COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
-COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
-COMPATIBLE_IOCTL(RFCOMMGETDEVINFO)
-COMPATIBLE_IOCTL(RFCOMMSTEALDLC)
-/* CAPI */
-COMPATIBLE_IOCTL(CAPI_REGISTER)
-COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER)
-COMPATIBLE_IOCTL(CAPI_GET_VERSION)
-COMPATIBLE_IOCTL(CAPI_GET_SERIAL)
-COMPATIBLE_IOCTL(CAPI_GET_PROFILE)
-COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD)
-COMPATIBLE_IOCTL(CAPI_GET_ERRCODE)
-COMPATIBLE_IOCTL(CAPI_INSTALLED)
-COMPATIBLE_IOCTL(CAPI_GET_FLAGS)
-COMPATIBLE_IOCTL(CAPI_SET_FLAGS)
-COMPATIBLE_IOCTL(CAPI_CLR_FLAGS)
-COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT)
-COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT)
-/* Misc. */
-COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */
-COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */
-COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
-COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
-COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
-COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
-/* hiddev */
-COMPATIBLE_IOCTL(HIDIOCGVERSION)
-COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
-COMPATIBLE_IOCTL(HIDIOCGDEVINFO)
-COMPATIBLE_IOCTL(HIDIOCGSTRING)
-COMPATIBLE_IOCTL(HIDIOCINITREPORT)
-COMPATIBLE_IOCTL(HIDIOCGREPORT)
-COMPATIBLE_IOCTL(HIDIOCSREPORT)
-COMPATIBLE_IOCTL(HIDIOCGREPORTINFO)
-COMPATIBLE_IOCTL(HIDIOCGFIELDINFO)
-COMPATIBLE_IOCTL(HIDIOCGUSAGE)
-COMPATIBLE_IOCTL(HIDIOCSUSAGE)
-COMPATIBLE_IOCTL(HIDIOCGUCODE)
-COMPATIBLE_IOCTL(HIDIOCGFLAG)
-COMPATIBLE_IOCTL(HIDIOCSFLAG)
-COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
-COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
-/* joystick */
-COMPATIBLE_IOCTL(JSIOCGVERSION)
-COMPATIBLE_IOCTL(JSIOCGAXES)
-COMPATIBLE_IOCTL(JSIOCGBUTTONS)
-COMPATIBLE_IOCTL(JSIOCGNAME(0))
-
-/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
- but we don't want warnings on other file systems. So declare
- them as compatible here. */
-#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2])
-#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2])
-
-IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
-IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
-
-#ifdef CONFIG_SPARC
-/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
-IGNORE_IOCTL(FBIOGTYPE)
-IGNORE_IOCTL(FBIOSATTR)
-IGNORE_IOCTL(FBIOGATTR)
-IGNORE_IOCTL(FBIOSVIDEO)
-IGNORE_IOCTL(FBIOGVIDEO)
-IGNORE_IOCTL(FBIOSCURPOS)
-IGNORE_IOCTL(FBIOGCURPOS)
-IGNORE_IOCTL(FBIOGCURMAX)
-IGNORE_IOCTL(FBIOPUTCMAP32)
-IGNORE_IOCTL(FBIOGETCMAP32)
-IGNORE_IOCTL(FBIOSCURSOR32)
-IGNORE_IOCTL(FBIOGCURSOR32)
-#endif
};
/*
@@ -927,51 +110,12 @@ IGNORE_IOCTL(FBIOGCURSOR32)
static long do_ioctl_trans(unsigned int cmd,
unsigned long arg, struct file *file)
{
- void __user *argp = compat_ptr(arg);
-
- switch (cmd) {
- case PPPIOCGIDLE32:
- return ppp_gidle(file, cmd, argp);
- case PPPIOCSCOMPRESS32:
- return ppp_scompress(file, cmd, argp);
- case PPPIOCSPASS32:
- case PPPIOCSACTIVE32:
- return ppp_sock_fprog_ioctl_trans(file, cmd, argp);
-#ifdef CONFIG_BLOCK
- case SG_IO:
- return sg_ioctl_trans(file, cmd, argp);
- case SG_GET_REQUEST_TABLE:
- return sg_grt_trans(file, cmd, argp);
- case MTIOCGET32:
- case MTIOCPOS32:
- return mt_ioctl_trans(file, cmd, argp);
-#endif
- /* Not implemented in the native kernel */
- case RTC_IRQP_READ32:
- case RTC_IRQP_SET32:
- case RTC_EPOCH_READ32:
- case RTC_EPOCH_SET32:
- return rtc_ioctl(file, cmd, argp);
- }
-
- /*
- * These take an integer instead of a pointer as 'arg',
- * so we must not do a compat_ptr() translation.
- */
- switch (cmd) {
- /* RAID */
- case HOT_REMOVE_DISK:
- case HOT_ADD_DISK:
- case SET_DISK_FAULTY:
- case SET_BITMAP_FILE:
- return vfs_ioctl(file, cmd, arg);
- }
-
return -ENOIOCTLCMD;
}
static int compat_ioctl_check_table(unsigned int xcmd)
{
+#ifdef CONFIG_BLOCK
int i;
const int max = ARRAY_SIZE(ioctl_pointer) - 1;
@@ -990,6 +134,9 @@ static int compat_ioctl_check_table(unsigned int xcmd)
i--;
return ioctl_pointer[i] == xcmd;
+#else
+ return 0;
+#endif
}
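
compat_ioctl_check_table() keeps working the same way even after the table shrinks: each ioctl number is passed through XFORM so the sorted table is more evenly distributed, then located by binary search. A self-contained illustration of that lookup, using made-up ioctl numbers and libc bsearch() in place of the open-coded search:

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same reversible transform as above: spreads the key bits so the
 * sorted table is more evenly distributed. */
#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)

static int cmp_u32(const void *a, const void *b)
{
	uint32_t x = *(const uint32_t *)a, y = *(const uint32_t *)b;

	return x < y ? -1 : x > y;
}

int main(void)
{
	/* A few made-up ioctl numbers standing in for the real table. */
	uint32_t table[] = { XFORM(0x5401u), XFORM(0x125Du), XFORM(0x2285u) };
	size_t n = sizeof(table) / sizeof(table[0]);
	uint32_t key = XFORM(0x125Du);

	qsort(table, n, sizeof(table[0]), cmp_u32);
	printf("found: %s\n",
	       bsearch(&key, table, n, sizeof(table[0]), cmp_u32)
	       ? "yes" : "no");
	return 0;
}
```
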
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
@@ -1006,20 +153,40 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (error)
goto out_fput;
- /*
- * To allow the compat_ioctl handlers to be self contained
- * we need to check the common ioctls here first.
- * Just handle them with the standard handlers below.
- */
switch (cmd) {
+ /* these are never seen by ->ioctl(), no argument or int argument */
case FIOCLEX:
case FIONCLEX:
+ case FIFREEZE:
+ case FITHAW:
+ case FICLONE:
+ goto do_ioctl;
+ /* these are never seen by ->ioctl(), pointer argument */
case FIONBIO:
case FIOASYNC:
case FIOQSIZE:
- break;
-
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+ case FS_IOC_FIEMAP:
+ case FIGETBSZ:
+ case FICLONERANGE:
+ case FIDEDUPERANGE:
+ goto found_handler;
+ /*
+ * The next group is the stuff handled inside file_ioctl().
+ * For regular files these never reach ->ioctl(); for
+ * devices, sockets, etc. they do and one (FIONREAD) is
+ * even accepted in some cases. In all those cases the
+ * argument has the same type, so we can handle these
+ * here, shunting them towards do_vfs_ioctl().
+ * ->compat_ioctl() will never see any of those.
+ */
+ /* pointer argument, never actually handled by ->ioctl() */
+ case FIBMAP:
+ goto found_handler;
+ /* handled by some ->ioctl(); always a pointer to int */
+ case FIONREAD:
+ goto found_handler;
+ /* these two get messy on amd64 due to alignment differences */
+#if defined(CONFIG_X86_64)
case FS_IOC_RESVSP_32:
case FS_IOC_RESVSP64_32:
error = compat_ioctl_preallocate(f.file, compat_ptr(arg));
@@ -1027,23 +194,9 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
#else
case FS_IOC_RESVSP:
case FS_IOC_RESVSP64:
- error = ioctl_preallocate(f.file, compat_ptr(arg));
- goto out_fput;
+ goto found_handler;
#endif
- case FICLONE:
- case FICLONERANGE:
- case FIDEDUPERANGE:
- case FS_IOC_FIEMAP:
- goto do_ioctl;
-
- case FIBMAP:
- case FIGETBSZ:
- case FIONREAD:
- if (S_ISREG(file_inode(f.file)->i_mode))
- break;
- /*FALL THROUGH*/
-
default:
if (f.file->f_op->compat_ioctl) {
error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
diff --git a/fs/dax.c b/fs/dax.c
index 2cc43cd914eb..1f1f0201cad1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1091,7 +1091,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ struct iomap *iomap, struct iomap *srcmap)
{
struct block_device *bdev = iomap->bdev;
struct dax_device *dax_dev = iomap->dax_dev;
@@ -1248,7 +1248,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
- struct iomap iomap = { 0 };
+ struct iomap iomap = { .type = IOMAP_HOLE };
+ struct iomap srcmap = { .type = IOMAP_HOLE };
unsigned flags = IOMAP_FAULT;
int error, major = 0;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -1293,7 +1294,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
* the file system block size to be equal the page size, which means
* that we never have to deal with more than a single extent here.
*/
- error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
if (iomap_errp)
*iomap_errp = error;
if (error) {
@@ -1472,7 +1473,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
vm_fault_t result = VM_FAULT_FALLBACK;
- struct iomap iomap = { 0 };
+ struct iomap iomap = { .type = IOMAP_HOLE };
+ struct iomap srcmap = { .type = IOMAP_HOLE };
pgoff_t max_pgoff;
void *entry;
loff_t pos;
@@ -1547,7 +1549,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* to look up our filesystem block.
*/
pos = (loff_t)xas.xa_index << PAGE_SHIFT;
- error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+ error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
+ &srcmap);
if (error)
goto unlock_entry;
diff --git a/fs/dcache.c b/fs/dcache.c
index e88cf0554e65..f7931b682a0d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1319,7 +1319,7 @@ resume:
if (!list_empty(&dentry->d_subdirs)) {
spin_unlock(&this_parent->d_lock);
- spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
+ spin_release(&dentry->d_lock.dep_map, _RET_IP_);
this_parent = dentry;
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 87846aad594b..dede25247b81 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -420,20 +420,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_u8(const char *name, umode_t mode,
- struct dentry *parent, u8 *value)
+void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent,
+ u8 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8,
&fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -465,20 +456,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_u16(const char *name, umode_t mode,
- struct dentry *parent, u16 *value)
+void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent,
+ u16 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16,
&fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -556,20 +538,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_u64(const char *name, umode_t mode,
- struct dentry *parent, u64 *value)
+void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent,
+ u64 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64,
&fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);
@@ -660,10 +633,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x8(const char *name, umode_t mode,
- struct dentry *parent, u8 *value)
+void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent,
+ u8 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8,
&fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -678,10 +651,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x16(const char *name, umode_t mode,
- struct dentry *parent, u16 *value)
+void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent,
+ u16 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16,
&fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -696,10 +669,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x32(const char *name, umode_t mode,
- struct dentry *parent, u32 *value)
+void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent,
+ u32 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32,
&fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -714,10 +687,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x64(const char *name, umode_t mode,
- struct dentry *parent, u64 *value)
+void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent,
+ u64 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64,
&fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);
@@ -748,12 +721,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
- struct dentry *parent, size_t *value)
+void debugfs_create_size_t(const char *name, umode_t mode,
+ struct dentry *parent, size_t *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value,
- &fops_size_t, &fops_size_t_ro,
- &fops_size_t_wo);
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_size_t,
+ &fops_size_t_ro, &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
@@ -785,12 +757,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
- struct dentry *parent, atomic_t *value)
+void debugfs_create_atomic_t(const char *name, umode_t mode,
+ struct dentry *parent, atomic_t *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value,
- &fops_atomic_t, &fops_atomic_t_ro,
- &fops_atomic_t_wo);
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic_t,
+ &fops_atomic_t_ro, &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9329ced91f1d..0ec4f270139f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -221,27 +221,6 @@ static inline struct page *dio_get_page(struct dio *dio,
}
/*
- * Warn about a page cache invalidation failure during a direct io write.
- */
-void dio_warn_stale_pagecache(struct file *filp)
-{
- static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
- char pathname[128];
- struct inode *inode = file_inode(filp);
- char *path;
-
- errseq_set(&inode->i_mapping->wb_err, -EIO);
- if (__ratelimit(&_rs)) {
- path = file_path(filp, pathname, sizeof(pathname));
- if (IS_ERR(path))
- path = "(unknown)";
- pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
- pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
- current->comm);
- }
-}
-
-/*
* dio_complete() - called when all DIO BIO I/O has been completed
*
* This drops i_dio_count, lets interested parties know that a DIO operation
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index feecb57defa7..5fb45d865ce5 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -378,6 +378,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return rc;
switch (cmd) {
+ case FITRIM:
case FS_IOC32_GETFLAGS:
case FS_IOC32_SETFLAGS:
case FS_IOC32_GETVERSION:
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 9d634d3a1845..74b0aaa7114c 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,6 +3,7 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select LIBCRC32C
help
EROFS (Enhanced Read-Only File System) is a lightweight
read-only file system with modern designs (eg. page-sized
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 19f89f9fb10c..2890a67a1ded 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -73,7 +73,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
victim = availables[--top];
get_page(victim);
} else {
- victim = erofs_allocpage(pagepool, GFP_KERNEL, false);
+ victim = erofs_allocpage(pagepool, GFP_KERNEL);
if (!victim)
return -ENOMEM;
victim->mapping = Z_EROFS_MAPPING_STAGING;
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index b1ee5654750d..385fa49c7749 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -11,6 +11,8 @@
#define EROFS_SUPER_OFFSET 1024
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
* be incompatible with this kernel version.
@@ -37,7 +39,6 @@ struct erofs_super_block {
__u8 uuid[16]; /* 128-bit uuid for volume */
__u8 volume_name[16]; /* volume name */
__le32 feature_incompat;
-
__u8 reserved2[44];
};
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 544a453f3076..1ed5beff7d11 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -85,6 +85,7 @@ struct erofs_sb_info {
u8 uuid[16]; /* 128-bit uuid for volume */
u8 volume_name[16]; /* volume name */
+ u32 feature_compat;
u32 feature_incompat;
unsigned int mount_opt;
@@ -278,9 +279,7 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
extern const struct super_operations erofs_sops;
extern const struct address_space_operations erofs_raw_access_aops;
-#ifdef CONFIG_EROFS_FS_ZIP
-extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
-#endif
+extern const struct address_space_operations z_erofs_aops;
/*
* Logical to physical block mapping, used by erofs_map_blocks()
@@ -382,7 +381,7 @@ int erofs_namei(struct inode *dir, struct qstr *name,
extern const struct file_operations erofs_dir_fops;
/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail);
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
#if (EROFS_PCPUBUF_NR_PAGES > 0)
void *erofs_get_pcpubuf(unsigned int pagenr);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0e369494f2f2..057e6d7b5b7f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -9,6 +9,7 @@
#include <linux/statfs.h>
#include <linux/parser.h>
#include <linux/seq_file.h>
+#include <linux/crc32c.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -46,6 +47,30 @@ void _erofs_info(struct super_block *sb, const char *function,
va_end(args);
}
+static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata)
+{
+ struct erofs_super_block *dsb;
+ u32 expected_crc, crc;
+
+ dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET,
+ EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL);
+ if (!dsb)
+ return -ENOMEM;
+
+ expected_crc = le32_to_cpu(dsb->checksum);
+ dsb->checksum = 0;
+ /* to allow for x86 boot sectors and other oddities. */
+ crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET);
+ kfree(dsb);
+
+ if (crc != expected_crc) {
+ erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+ return 0;
+}
+
static void erofs_inode_init_once(void *ptr)
{
struct erofs_inode *vi = ptr;
@@ -112,7 +137,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi = EROFS_SB(sb);
- data = kmap_atomic(page);
+ data = kmap(page);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
ret = -EINVAL;
@@ -121,6 +146,13 @@ static int erofs_read_superblock(struct super_block *sb)
goto out;
}
+ sbi->feature_compat = le32_to_cpu(dsb->feature_compat);
+ if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) {
+ ret = erofs_superblock_csum_verify(sb, data);
+ if (ret)
+ goto out;
+ }
+
blkszbits = dsb->blkszbits;
/* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
if (blkszbits != LOG_BLOCK_SIZE) {
@@ -155,7 +187,7 @@ static int erofs_read_superblock(struct super_block *sb)
}
ret = 0;
out:
- kunmap_atomic(data);
+ kunmap(page);
put_page(page);
return ret;
}
@@ -566,9 +598,6 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",cache_strategy=readahead");
} else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) {
seq_puts(seq, ",cache_strategy=readaround");
- } else {
- seq_puts(seq, ",cache_strategy=(unknown)");
- DBG_BUGON(1);
}
#endif
return 0;
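The new verification recomputes CRC32C over everything after the 1024-byte superblock offset with the on-disk checksum field zeroed. A minimal userspace sketch of the same calculation, assuming the checksum field sits right after the 4-byte magic in struct erofs_super_block (bitwise CRC32C with no final inversion, matching the kernel's crc32c() seeded with ~0 above):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define EROFS_SUPER_OFFSET	1024
#define EROFS_BLKSIZ		4096

static uint32_t crc32c(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
	}
	return crc;
}

/* checksum the tail of block 0 with the stored checksum cleared first */
static uint32_t erofs_sb_csum(uint8_t blk[EROFS_BLKSIZ])
{
	memset(blk + EROFS_SUPER_OFFSET + 4, 0, 4);	/* dsb->checksum = 0 */
	return crc32c(~0U, blk + EROFS_SUPER_OFFSET,
		      EROFS_BLKSIZ - EROFS_SUPER_OFFSET);
}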
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index d92b3e753a6f..1e8e1450d5b0 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -7,7 +7,7 @@
#include "internal.h"
#include <linux/pagevec.h>
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail)
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
{
struct page *page;
@@ -16,7 +16,7 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail)
DBG_BUGON(page_ref_count(page) != 1);
list_del(&page->lru);
} else {
- page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0);
+ page = alloc_page(gfp);
}
return page;
}
@@ -149,8 +149,7 @@ static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
}
static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp,
- bool cleanup)
+ struct erofs_workgroup *grp)
{
/*
* If managed cache is on, refcount of workgroups
@@ -188,8 +187,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
}
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
- unsigned long nr_shrink,
- bool cleanup)
+ unsigned long nr_shrink)
{
pgoff_t first_index = 0;
void *batch[PAGEVEC_SIZE];
@@ -208,7 +206,7 @@ repeat:
first_index = grp->index + 1;
/* try to shrink each valid workgroup */
- if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
+ if (!erofs_try_to_release_workgroup(sbi, grp))
continue;
++freed;
@@ -245,7 +243,8 @@ void erofs_shrinker_unregister(struct super_block *sb)
struct erofs_sb_info *const sbi = EROFS_SB(sb);
mutex_lock(&sbi->umount_mutex);
- erofs_shrink_workstation(sbi, ~0UL, true);
+ /* clean up all remaining workgroups in memory */
+ erofs_shrink_workstation(sbi, ~0UL);
spin_lock(&erofs_sb_list_lock);
list_del(&sbi->list);
@@ -294,7 +293,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
spin_unlock(&erofs_sb_list_lock);
sbi->shrinker_run_no = run_no;
- freed += erofs_shrink_workstation(sbi, nr, false);
+ freed += erofs_shrink_workstation(sbi, nr);
spin_lock(&erofs_sb_list_lock);
/* Get the next list element before we move this one */
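With the nofail parameter gone, must-succeed callers now state the policy in the gfp mask instead, as the zdata.c hunks below do. A sketch of the two calling styles (the wrapper names are illustrative only):

/* fallible caller: owns the NULL check */
static struct page *try_grab_page(struct list_head *pool)
{
	return erofs_allocpage(pool, GFP_KERNEL);	/* may return NULL */
}

/* must-succeed caller: __GFP_NOFAIL makes the allocator retry internally,
 * so the result can be dereferenced without a check */
static struct page *grab_staging_page(struct list_head *pool)
{
	struct page *page = erofs_allocpage(pool, GFP_NOFS | __GFP_NOFAIL);

	page->mapping = Z_EROFS_MAPPING_STAGING;
	return page;
}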
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index fad80c97d247..ca99425a4536 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -337,9 +337,9 @@ retry:
return COLLECT_PRIMARY; /* :( better luck next time */
}
-static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
+ struct inode *inode,
+ struct erofs_map_blocks *map)
{
struct erofs_workgroup *grp;
struct z_erofs_pcluster *pcl;
@@ -349,20 +349,20 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag);
if (!grp)
- return NULL;
+ return -ENOENT;
pcl = container_of(grp, struct z_erofs_pcluster, obj);
if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
DBG_BUGON(1);
erofs_workgroup_put(grp);
- return ERR_PTR(-EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
cl = z_erofs_primarycollection(pcl);
if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
DBG_BUGON(1);
erofs_workgroup_put(grp);
- return ERR_PTR(-EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
length = READ_ONCE(pcl->length);
@@ -370,7 +370,7 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
DBG_BUGON(1);
erofs_workgroup_put(grp);
- return ERR_PTR(-EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
} else {
unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;
@@ -394,12 +394,12 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
clt->tailpcl = NULL;
clt->pcl = pcl;
clt->cl = cl;
- return cl;
+ return 0;
}
-static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_register_collection(struct z_erofs_collector *clt,
+ struct inode *inode,
+ struct erofs_map_blocks *map)
{
struct z_erofs_pcluster *pcl;
struct z_erofs_collection *cl;
@@ -408,7 +408,7 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
/* no available workgroup, let's allocate one */
pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
if (!pcl)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
z_erofs_pcluster_init_always(pcl);
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
@@ -442,7 +442,7 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
if (err) {
mutex_unlock(&cl->lock);
kmem_cache_free(pcluster_cachep, pcl);
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
}
/* used to check tail merging loop due to corrupted images */
if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
@@ -450,14 +450,14 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
clt->owned_head = &pcl->next;
clt->pcl = pcl;
clt->cl = cl;
- return cl;
+ return 0;
}
static int z_erofs_collector_begin(struct z_erofs_collector *clt,
struct inode *inode,
struct erofs_map_blocks *map)
{
- struct z_erofs_collection *cl;
+ int ret;
DBG_BUGON(clt->cl);
@@ -471,19 +471,22 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt,
}
repeat:
- cl = cllookup(clt, inode, map);
- if (!cl) {
- cl = clregister(clt, inode, map);
+ ret = z_erofs_lookup_collection(clt, inode, map);
+ if (ret == -ENOENT) {
+ ret = z_erofs_register_collection(clt, inode, map);
- if (cl == ERR_PTR(-EAGAIN))
+ /* someone registered at the same time, give another try */
+ if (ret == -EAGAIN) {
+ cond_resched();
goto repeat;
+ }
}
- if (IS_ERR(cl))
- return PTR_ERR(cl);
+ if (ret)
+ return ret;
z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- cl->pagevec, cl->vcnt);
+ clt->cl->pagevec, clt->cl->vcnt);
clt->compressedpages = clt->pcl->compressed_pages;
if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
@@ -543,15 +546,6 @@ static bool z_erofs_collector_end(struct z_erofs_collector *clt)
return true;
}
-static inline struct page *__stagingpage_alloc(struct list_head *pagepool,
- gfp_t gfp)
-{
- struct page *page = erofs_allocpage(pagepool, gfp, true);
-
- page->mapping = Z_EROFS_MAPPING_STAGING;
- return page;
-}
-
static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
unsigned int cachestrategy,
erofs_off_t la)
@@ -571,7 +565,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct list_head *pagepool)
{
struct inode *const inode = fe->inode;
- struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode);
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
struct z_erofs_collector *const clt = &fe->clt;
const loff_t offset = page_offset(page);
@@ -658,8 +652,9 @@ retry:
/* should allocate an additional staging page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
- __stagingpage_alloc(pagepool, GFP_NOFS);
+ erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL);
+ newpage->mapping = Z_EROFS_MAPPING_STAGING;
err = z_erofs_attach_page(clt, newpage,
Z_EROFS_PAGE_TYPE_EXCLUSIVE);
if (!err)
@@ -698,13 +693,11 @@ err_out:
goto out;
}
-static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
+ bool sync, int bios)
{
- tagptr1_t t = tagptr_init(tagptr1_t, ptr);
- struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t);
- bool background = tagptr_unfold_tags(t);
-
- if (!background) {
+ /* wake up the caller thread for sync decompression */
+ if (sync) {
unsigned long flags;
spin_lock_irqsave(&io->u.wait.lock, flags);
@@ -718,37 +711,30 @@ static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
queue_work(z_erofs_workqueue, &io->u.work);
}
-static inline void z_erofs_vle_read_endio(struct bio *bio)
+static void z_erofs_decompressqueue_endio(struct bio *bio)
{
- struct erofs_sb_info *sbi = NULL;
+ tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
+ struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
blk_status_t err = bio->bi_status;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
- bool cachemngd = false;
DBG_BUGON(PageUptodate(page));
DBG_BUGON(!page->mapping);
- if (!sbi && !z_erofs_page_is_staging(page))
- sbi = EROFS_SB(page->mapping->host->i_sb);
-
- /* sbi should already be gotten if the page is managed */
- if (sbi)
- cachemngd = erofs_page_is_managed(sbi, page);
-
if (err)
SetPageError(page);
- else if (cachemngd)
- SetPageUptodate(page);
- if (cachemngd)
+ if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
+ if (!err)
+ SetPageUptodate(page);
unlock_page(page);
+ }
}
-
- z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+ z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
bio_put(bio);
}
@@ -953,9 +939,8 @@ out:
return err;
}
-static void z_erofs_vle_unzip_all(struct super_block *sb,
- struct z_erofs_unzip_io *io,
- struct list_head *pagepool)
+static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
+ struct list_head *pagepool)
{
z_erofs_next_pcluster_t owned = io->head;
@@ -971,21 +956,21 @@ static void z_erofs_vle_unzip_all(struct super_block *sb,
pcl = container_of(owned, struct z_erofs_pcluster, next);
owned = READ_ONCE(pcl->next);
- z_erofs_decompress_pcluster(sb, pcl, pagepool);
+ z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
}
}
-static void z_erofs_vle_unzip_wq(struct work_struct *work)
+static void z_erofs_decompressqueue_work(struct work_struct *work)
{
- struct z_erofs_unzip_io_sb *iosb =
- container_of(work, struct z_erofs_unzip_io_sb, io.u.work);
+ struct z_erofs_decompressqueue *bgq =
+ container_of(work, struct z_erofs_decompressqueue, u.work);
LIST_HEAD(pagepool);
- DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool);
+ DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ z_erofs_decompress_queue(bgq, &pagepool);
put_pages_list(&pagepool);
- kvfree(iosb);
+ kvfree(bgq);
}
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
@@ -994,8 +979,6 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
struct address_space *mc,
gfp_t gfp)
{
- /* determined at compile time to avoid too many #ifdefs */
- const bool nocache = __builtin_constant_p(mc) ? !mc : false;
const pgoff_t index = pcl->obj.index;
bool tocache = false;
@@ -1016,7 +999,7 @@ repeat:
* the cached page has not been allocated and
	 * a placeholder is out there, prepare it now.
*/
- if (!nocache && page == PAGE_UNALLOCATED) {
+ if (page == PAGE_UNALLOCATED) {
tocache = true;
goto out_allocpage;
}
@@ -1029,21 +1012,6 @@ repeat:
mapping = READ_ONCE(page->mapping);
/*
- * if managed cache is disabled, it's no way to
- * get such a cached-like page.
- */
- if (nocache) {
- /* if managed cache is disabled, it is impossible `justfound' */
- DBG_BUGON(justfound);
-
- /* and it should be locked, not uptodate, and not truncated */
- DBG_BUGON(!PageLocked(page));
- DBG_BUGON(PageUptodate(page));
- DBG_BUGON(!mapping);
- goto out;
- }
-
- /*
* unmanaged (file) pages are all locked solidly,
* therefore it is impossible for `mapping' to be NULL.
*/
@@ -1093,50 +1061,52 @@ repeat:
unlock_page(page);
put_page(page);
out_allocpage:
- page = __stagingpage_alloc(pagepool, gfp);
- if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
- list_add(&page->lru, pagepool);
- cpu_relax();
- goto repeat;
- }
- if (nocache || !tocache)
- goto out;
- if (add_to_page_cache_lru(page, mc, index + nr, gfp)) {
+ page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
+ if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
+ /* non-LRU / non-movable temporary page is needed */
page->mapping = Z_EROFS_MAPPING_STAGING;
- goto out;
+ tocache = false;
}
+ if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
+ if (tocache) {
+			/* since it was added to the managed cache successfully */
+ unlock_page(page);
+ put_page(page);
+ } else {
+ list_add(&page->lru, pagepool);
+ }
+ cond_resched();
+ goto repeat;
+ }
set_page_private(page, (unsigned long)pcl);
SetPagePrivate(page);
out: /* the only exit (for tracing and debugging) */
return page;
}
-static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb,
- struct z_erofs_unzip_io *io,
- bool foreground)
+static struct z_erofs_decompressqueue *
+jobqueue_init(struct super_block *sb,
+ struct z_erofs_decompressqueue *fgq, bool *fg)
{
- struct z_erofs_unzip_io_sb *iosb;
-
- if (foreground) {
- /* waitqueue available for foreground io */
- DBG_BUGON(!io);
+ struct z_erofs_decompressqueue *q;
- init_waitqueue_head(&io->u.wait);
- atomic_set(&io->pending_bios, 0);
- goto out;
+ if (fg && !*fg) {
+ q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
+ if (!q) {
+ *fg = true;
+ goto fg_out;
+ }
+ INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
+ } else {
+fg_out:
+ q = fgq;
+ init_waitqueue_head(&fgq->u.wait);
+ atomic_set(&fgq->pending_bios, 0);
}
-
- iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL);
- DBG_BUGON(!iosb);
-
- /* initialize fields in the allocated descriptor */
- io = &iosb->io;
- iosb->sb = sb;
- INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
-out:
- io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
- return io;
+ q->sb = sb;
+ q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+ return q;
}
/* define decompression jobqueue types */
@@ -1147,22 +1117,17 @@ enum {
};
static void *jobqueueset_init(struct super_block *sb,
- z_erofs_next_pcluster_t qtail[],
- struct z_erofs_unzip_io *q[],
- struct z_erofs_unzip_io *fgq,
- bool forcefg)
+ struct z_erofs_decompressqueue *q[],
+ struct z_erofs_decompressqueue *fgq, bool *fg)
{
/*
	 * if managed cache is enabled, a bypass jobqueue is needed:
	 * no need to read from the device for pclusters in this queue.
*/
- q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true);
- qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
-
- q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg);
- qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
+ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
+ q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);
- return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg));
+ return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
}
static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
@@ -1184,9 +1149,8 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
qtail[JQ_BYPASS] = &pcl->next;
}
-static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[],
- unsigned int nr_bios,
- bool force_fg)
+static bool postsubmit_is_all_bypassed(struct z_erofs_decompressqueue *q[],
+ unsigned int nr_bios, bool force_fg)
{
/*
* although background is preferred, no one is pending for submission.
@@ -1195,19 +1159,19 @@ static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[],
if (force_fg || nr_bios)
return false;
- kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io));
+ kvfree(q[JQ_SUBMIT]);
return true;
}
-static bool z_erofs_vle_submit_all(struct super_block *sb,
- z_erofs_next_pcluster_t owned_head,
- struct list_head *pagepool,
- struct z_erofs_unzip_io *fgq,
- bool force_fg)
+static bool z_erofs_submit_queue(struct super_block *sb,
+ z_erofs_next_pcluster_t owned_head,
+ struct list_head *pagepool,
+ struct z_erofs_decompressqueue *fgq,
+ bool *force_fg)
{
- struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
- struct z_erofs_unzip_io *q[NR_JOBQUEUES];
+ struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
struct bio *bio;
void *bi_private;
/* since bio will be NULL, no need to initialize last_index */
@@ -1221,7 +1185,9 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
force_submit = false;
bio = NULL;
nr_bios = 0;
- bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg);
+ bi_private = jobqueueset_init(sb, q, fgq, force_fg);
+ qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
+ qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
/* by default, all need io submission */
q[JQ_SUBMIT]->head = owned_head;
@@ -1268,7 +1234,7 @@ submit_bio_retry:
if (!bio) {
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- bio->bi_end_io = z_erofs_vle_read_endio;
+ bio->bi_end_io = z_erofs_decompressqueue_endio;
bio_set_dev(bio, sb->s_bdev);
bio->bi_iter.bi_sector = (sector_t)(first_index + i) <<
LOG_SECTORS_PER_BLOCK;
@@ -1297,40 +1263,38 @@ skippage:
if (bio)
submit_bio(bio);
- if (postsubmit_is_all_bypassed(q, nr_bios, force_fg))
+ if (postsubmit_is_all_bypassed(q, nr_bios, *force_fg))
return true;
- z_erofs_vle_unzip_kickoff(bi_private, nr_bios);
+ z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
return true;
}
-static void z_erofs_submit_and_unzip(struct super_block *sb,
- struct z_erofs_collector *clt,
- struct list_head *pagepool,
- bool force_fg)
+static void z_erofs_runqueue(struct super_block *sb,
+ struct z_erofs_collector *clt,
+ struct list_head *pagepool, bool force_fg)
{
- struct z_erofs_unzip_io io[NR_JOBQUEUES];
+ struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (!z_erofs_vle_submit_all(sb, clt->owned_head,
- pagepool, io, force_fg))
+ if (!z_erofs_submit_queue(sb, clt->owned_head,
+ pagepool, io, &force_fg))
return;
- /* decompress no I/O pclusters immediately */
- z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool);
+ /* handle bypass queue (no i/o pclusters) immediately */
+ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
if (!force_fg)
return;
/* wait until all bios are completed */
- wait_event(io[JQ_SUBMIT].u.wait,
- !atomic_read(&io[JQ_SUBMIT].pending_bios));
+ io_wait_event(io[JQ_SUBMIT].u.wait,
+ !atomic_read(&io[JQ_SUBMIT].pending_bios));
- /* let's synchronous decompression */
- z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool);
+ /* handle synchronous decompress queue in the caller context */
+ z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
-static int z_erofs_vle_normalaccess_readpage(struct file *file,
- struct page *page)
+static int z_erofs_readpage(struct file *file, struct page *page)
{
struct inode *const inode = page->mapping->host;
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
@@ -1345,7 +1309,7 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file,
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true);
+ z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
@@ -1364,10 +1328,8 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
return nr <= sbi->max_sync_decompress_pages;
}
-static int z_erofs_vle_normalaccess_readpages(struct file *filp,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned int nr_pages)
+static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned int nr_pages)
{
struct inode *const inode = mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -1422,7 +1384,7 @@ static int z_erofs_vle_normalaccess_readpages(struct file *filp,
(void)z_erofs_collector_end(&f.clt);
- z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync);
+ z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);
if (f.map.mpage)
put_page(f.map.mpage);
@@ -1432,8 +1394,8 @@ static int z_erofs_vle_normalaccess_readpages(struct file *filp,
return 0;
}
-const struct address_space_operations z_erofs_vle_normalaccess_aops = {
- .readpage = z_erofs_vle_normalaccess_readpage,
- .readpages = z_erofs_vle_normalaccess_readpages,
+const struct address_space_operations z_erofs_aops = {
+ .readpage = z_erofs_readpage,
+ .readpages = z_erofs_readpages,
};
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index faf950189bd7..7824f5563a55 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -84,7 +84,8 @@ struct z_erofs_pcluster {
#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster)
-struct z_erofs_unzip_io {
+struct z_erofs_decompressqueue {
+ struct super_block *sb;
atomic_t pending_bios;
z_erofs_next_pcluster_t head;
@@ -94,11 +95,6 @@ struct z_erofs_unzip_io {
} u;
};
-struct z_erofs_unzip_io_sb {
- struct z_erofs_unzip_io io;
- struct super_block *sb;
-};
-
#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
struct page *page)
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 6a26c293ae2d..736db3a4cdef 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -22,11 +22,11 @@ int z_erofs_fill_inode(struct inode *inode)
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
}
- inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops;
+ inode->i_mapping->a_ops = &z_erofs_aops;
return 0;
}
-static int fill_inode_lazy(struct inode *inode)
+static int z_erofs_fill_inode_lazy(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
@@ -138,8 +138,8 @@ static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
return 0;
}
-static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned long lcn)
+static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned long lcn)
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
@@ -311,13 +311,13 @@ out:
return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos));
}
-static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned int lcn)
+static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned int lcn)
{
const unsigned int datamode = EROFS_I(m->inode)->datalayout;
if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
- return vle_legacy_load_cluster_from_disk(m, lcn);
+ return legacy_load_cluster_from_disk(m, lcn);
if (datamode == EROFS_INODE_FLAT_COMPRESSION)
return compacted_load_cluster_from_disk(m, lcn);
@@ -325,8 +325,8 @@ static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m,
return -EINVAL;
}
-static int vle_extent_lookback(struct z_erofs_maprecorder *m,
- unsigned int lookback_distance)
+static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
+ unsigned int lookback_distance)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
struct erofs_map_blocks *const map = m->map;
@@ -343,7 +343,7 @@ static int vle_extent_lookback(struct z_erofs_maprecorder *m,
/* load extent head logical cluster if needed */
lcn -= lookback_distance;
- err = vle_load_cluster_from_disk(m, lcn);
+ err = z_erofs_load_cluster_from_disk(m, lcn);
if (err)
return err;
@@ -356,7 +356,7 @@ static int vle_extent_lookback(struct z_erofs_maprecorder *m,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- return vle_extent_lookback(m, m->delta[0]);
+ return z_erofs_extent_lookback(m, m->delta[0]);
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
map->m_flags &= ~EROFS_MAP_ZIPPED;
/* fallthrough */
@@ -396,7 +396,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
goto out;
}
- err = fill_inode_lazy(inode);
+ err = z_erofs_fill_inode_lazy(inode);
if (err)
goto out;
@@ -405,7 +405,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
m.lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
- err = vle_load_cluster_from_disk(&m, m.lcn);
+ err = z_erofs_load_cluster_from_disk(&m, m.lcn);
if (err)
goto unmap_out;
@@ -436,7 +436,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
/* fallthrough */
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
		/* get the corresponding first chunk */
- err = vle_extent_lookback(&m, m.delta[0]);
+ err = z_erofs_extent_lookback(&m, m.delta[0]);
if (err)
goto unmap_out;
break;
diff --git a/fs/exec.c b/fs/exec.c
index 555e93c7dec8..a504ed68621d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,7 +59,6 @@
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
-#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
@@ -1015,7 +1014,7 @@ static int exec_mmap(struct mm_struct *mm)
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
- mm_release(tsk, old_mm);
+ exec_mm_release(tsk, old_mm);
if (old_mm) {
sync_mm_rss(old_mm);
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e0cc55164505..fa9c951d3471 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -269,7 +269,7 @@ goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal,
ext2_fsblk_t group_first_block, group_last_block;
group_first_block = ext2_group_first_block_no(sb, group);
- group_last_block = group_first_block + EXT2_BLOCKS_PER_GROUP(sb) - 1;
+ group_last_block = ext2_group_last_block_no(sb, group);
if ((rsv->_rsv_start > group_last_block) ||
(rsv->_rsv_end < group_first_block))
@@ -666,37 +666,24 @@ ext2_try_to_allocate(struct super_block *sb, int group,
unsigned long *count,
struct ext2_reserve_window *my_rsv)
{
- ext2_fsblk_t group_first_block;
+ ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group);
+ ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group);
ext2_grpblk_t start, end;
unsigned long num = 0;
+ start = 0;
+ end = group_last_block - group_first_block + 1;
/* we do allocation within the reservation window if we have a window */
if (my_rsv) {
- group_first_block = ext2_group_first_block_no(sb, group);
if (my_rsv->_rsv_start >= group_first_block)
start = my_rsv->_rsv_start - group_first_block;
- else
- /* reservation window cross group boundary */
- start = 0;
- end = my_rsv->_rsv_end - group_first_block + 1;
- if (end > EXT2_BLOCKS_PER_GROUP(sb))
- /* reservation window crosses group boundary */
- end = EXT2_BLOCKS_PER_GROUP(sb);
- if ((start <= grp_goal) && (grp_goal < end))
- start = grp_goal;
- else
+ if (my_rsv->_rsv_end < group_last_block)
+ end = my_rsv->_rsv_end - group_first_block + 1;
+ if (grp_goal < start || grp_goal >= end)
grp_goal = -1;
- } else {
- if (grp_goal > 0)
- start = grp_goal;
- else
- start = 0;
- end = EXT2_BLOCKS_PER_GROUP(sb);
}
-
BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb));
-repeat:
if (grp_goal < 0) {
grp_goal = find_next_usable_block(start, bitmap_bh, end);
if (grp_goal < 0)
@@ -711,32 +698,23 @@ repeat:
;
}
}
- start = grp_goal;
- if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal,
- bitmap_bh->b_data)) {
- /*
- * The block was allocated by another thread, or it was
- * allocated and then freed by another thread
- */
- start++;
- grp_goal++;
- if (start >= end)
- goto fail_access;
- goto repeat;
- }
- num++;
- grp_goal++;
- while (num < *count && grp_goal < end
- && !ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group),
+ for (; num < *count && grp_goal < end; grp_goal++) {
+ if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group),
grp_goal, bitmap_bh->b_data)) {
+ if (num == 0)
+ continue;
+ break;
+ }
num++;
- grp_goal++;
}
+
+ if (num == 0)
+ goto fail_access;
+
*count = num;
return grp_goal - num;
fail_access:
- *count = num;
return -1;
}
@@ -754,10 +732,9 @@ fail_access:
* but we will shift to the place where start_block is,
* then start from there, when looking for a reservable space.
*
- * @size: the target new reservation window size
+ * @sb: the super block.
*
- * @group_first_block: the first block we consider to start
- * the real search from
+ * @start_block: the first block we consider to start the real search from
*
* @last_block:
* the maximum block number that our goal reservable space
@@ -908,7 +885,7 @@ static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv,
spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock;
group_first_block = ext2_group_first_block_no(sb, group);
- group_end_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1);
+ group_end_block = ext2_group_last_block_no(sb, group);
if (grp_goal < 0)
start_block = group_first_block;
@@ -1115,7 +1092,7 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group,
* first block is the block number of the first block in this group
*/
group_first_block = ext2_group_first_block_no(sb, group);
- group_last_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1);
+ group_last_block = ext2_group_last_block_no(sb, group);
/*
* Basically we will allocate a new block from inode's reservation
@@ -1313,6 +1290,13 @@ retry_alloc:
if (free_blocks > 0) {
grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
EXT2_BLOCKS_PER_GROUP(sb));
+ /*
+ * In case we retry allocation (due to fs reservation not
+	 * working out or fs corruption), bitmap_bh is a non-NULL
+ * pointer and we have to release it before calling
+ * read_block_bitmap().
+ */
+ brelse(bitmap_bh);
bitmap_bh = read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
@@ -1404,6 +1388,7 @@ allocated:
* use. So we may want to selectively mark some of the blocks
* as free
*/
+ num = *count;
goto retry_alloc;
}
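The rewritten ext2_try_to_allocate() reduces to a single bounded bit-scan: skip set bits while hunting for the start of a free run, then extend the run until a set bit is hit or *count blocks are taken. The core loop, as a locking-free sketch (the real code uses ext2_set_bit_atomic() under sb_bgl_lock()):

static int try_alloc_run(unsigned long *bitmap, int start, int end,
			 int goal, unsigned long *count)
{
	unsigned long num = 0;

	if (goal < 0)
		goal = start;	/* stand-in for find_next_usable_block() */

	for (; num < *count && goal < end; goal++) {
		if (test_and_set_bit(goal, bitmap)) {
			if (num == 0)
				continue;	/* still seeking a free bit */
			break;			/* free run has ended */
		}
		num++;
	}
	if (num == 0)
		return -1;			/* nothing allocated */
	*count = num;
	return goal - num;			/* first block of the run */
}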
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 10ab238de9a6..8178bd38a9d6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -813,6 +813,18 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
}
+static inline ext2_fsblk_t
+ext2_group_last_block_no(struct super_block *sb, unsigned long group_no)
+{
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+ if (group_no == sbi->s_groups_count - 1)
+ return le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
+ else
+ return ext2_group_first_block_no(sb, group_no) +
+ EXT2_BLOCKS_PER_GROUP(sb) - 1;
+}
+
#define ext2_set_bit __test_and_set_bit_le
#define ext2_clear_bit __test_and_clear_bit_le
#define ext2_test_bit test_bit_le
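A worked example for the new helper: with s_first_data_block = 1, EXT2_BLOCKS_PER_GROUP(sb) = 32 and s_blocks_count = 100, groups 0, 1 and 2 end at blocks 32, 64 and 96, but the last group is truncated by the device size, so ext2_group_last_block_no() returns s_blocks_count - 1 = 99 rather than 97 + 31 = 128.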
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7004ce581a32..119667e65890 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -701,10 +701,13 @@ static int ext2_get_blocks(struct inode *inode,
if (!partial) {
count++;
mutex_unlock(&ei->truncate_mutex);
- if (err)
- goto cleanup;
goto got_it;
}
+
+ if (err) {
+ mutex_unlock(&ei->truncate_mutex);
+ goto cleanup;
+ }
}
/*
@@ -801,7 +804,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
#ifdef CONFIG_FS_DAX
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned flags, struct iomap *iomap)
+ unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 1b853fb0b163..32a8d10b579d 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -145,10 +145,13 @@ setversion_out:
if (ei->i_block_alloc_info){
struct ext2_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
rsv->rsv_goal_size = rsv_window_size;
+ } else {
+ ret = -ENOMEM;
}
+
mutex_unlock(&ei->truncate_mutex);
mnt_drop_write_file(filp);
- return 0;
+ return ret;
}
default:
return -ENOTTY;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 30c630d73f0f..bcffe25da2f0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -702,13 +702,7 @@ static int ext2_check_descriptors(struct super_block *sb)
for (i = 0; i < sbi->s_groups_count; i++) {
struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL);
ext2_fsblk_t first_block = ext2_group_first_block_no(sb, i);
- ext2_fsblk_t last_block;
-
- if (i == sbi->s_groups_count - 1)
- last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
- else
- last_block = first_block +
- (EXT2_BLOCKS_PER_GROUP(sb) - 1);
+ ext2_fsblk_t last_block = ext2_group_last_block_no(sb, i);
if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
le32_to_cpu(gdp->bg_block_bitmap) > last_block)
@@ -806,7 +800,6 @@ static unsigned long descriptor_loc(struct super_block *sb,
{
struct ext2_sb_info *sbi = EXT2_SB(sb);
unsigned long bg, first_meta_bg;
- int has_super = 0;
first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
@@ -814,10 +807,8 @@ static unsigned long descriptor_loc(struct super_block *sb,
nr < first_meta_bg)
return (logic_sb_block + nr + 1);
bg = sbi->s_desc_per_block * nr;
- if (ext2_bg_has_super(sb, bg))
- has_super = 1;
- return ext2_group_first_block_no(sb, bg) + has_super;
+ return ext2_group_first_block_no(sb, bg) + ext2_bg_has_super(sb, bg);
}
static int ext2_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b3a2cc7c0252..f8578caba40d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -198,6 +198,12 @@ struct ext4_system_blocks {
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
+struct ext4_io_end_vec {
+ struct list_head list; /* list of io_end_vec */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+};
+
/*
* For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
@@ -211,8 +217,7 @@ typedef struct ext4_io_end {
* bios covering the extent */
unsigned int flag; /* unwritten or not */
atomic_t count; /* reference counter */
- loff_t offset; /* offset in the file */
- ssize_t size; /* size of the extent */
+ struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -1579,7 +1584,6 @@ enum {
EXT4_STATE_NO_EXPAND, /* No space for expansion */
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
- EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
@@ -2562,8 +2566,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
@@ -2606,7 +2608,6 @@ extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
@@ -3266,6 +3267,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
+ ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -3298,6 +3301,10 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
ext4_lblk_t lblk2, ext4_lblk_t count,
int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
+extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred);
+
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -3324,6 +3331,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
int len,
struct writeback_control *wbc,
bool keep_towrite);
+extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
@@ -3381,6 +3390,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
extern const struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_report_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
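Since an io_end now carries a list of (offset, size) ranges rather than a single pair, writeback can accumulate several discontiguous extents into one completion. A sketch of how a caller is expected to grow the vector with the two new helpers (assuming, as in the page-io.c part of this series, that ext4_alloc_io_end_vec() returns an ERR_PTR on allocation failure):

static int record_range(ext4_io_end_t *io_end, loff_t offset, ssize_t size)
{
	struct ext4_io_end_vec *io_end_vec;

	if (!list_empty(&io_end->list_vec)) {
		io_end_vec = ext4_last_io_end_vec(io_end);
		/* contiguous with the previous range? just extend it */
		if (io_end_vec->offset + io_end_vec->size == offset) {
			io_end_vec->size += size;
			return 0;
		}
	}
	io_end_vec = ext4_alloc_io_end_vec(io_end);
	if (IS_ERR(io_end_vec))
		return PTR_ERR(io_end_vec);
	io_end_vec->offset = offset;
	io_end_vec->size = size;
	return 0;
}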
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7c70b08d104c..d3b8cdea5df7 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -65,12 +65,14 @@ static int ext4_journal_check_start(struct super_block *sb)
}
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks)
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds)
{
journal_t *journal;
int err;
- trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
+ _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
@@ -78,8 +80,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
journal = EXT4_SB(sb)->s_journal;
if (!journal)
return ext4_get_nojournal();
- return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
- type, line);
+ return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
+ GFP_NOFS, type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -119,8 +121,8 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return ext4_get_nojournal();
sb = handle->h_journal->j_private;
- trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
- _RET_IP_);
+ trace_ext4_journal_start_reserved(sb,
+ jbd2_handle_buffer_credits(handle), _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0) {
jbd2_journal_free_reserved(handle);
@@ -133,6 +135,19 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return handle;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred)
+{
+ if (!ext4_handle_valid(handle))
+ return 0;
+ if (jbd2_handle_buffer_credits(handle) >= check_cred &&
+ handle->h_revoke_credits >= revoke_cred)
+ return 0;
+ extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle));
+ revoke_cred = max(0, revoke_cred - handle->h_revoke_credits);
+ return ext4_journal_extend(handle, extend_cred, revoke_cred);
+}
+
static void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn,
struct buffer_head *bh,
@@ -278,7 +293,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle), err);
return err;
}
ext4_error_inode(inode, where, line,
@@ -289,7 +304,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle),
+ err);
}
} else {
if (inode)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index ef8fcf7d0d3b..a6b9b66dbfad 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -261,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks);
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -288,28 +289,41 @@ static inline int ext4_handle_is_aborted(handle_t *handle)
return 0;
}
-static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
+ int blocks)
{
- if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
- return 0;
- return 1;
+ /* Freeing each metadata block can result in freeing one cluster */
+ return blocks * EXT4_SB(sb)->s_cluster_ratio;
+}
+
+static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
+{
+ return ext4_free_metadata_revoke_credits(sb, 8);
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits(sb))
#define ext4_journal_start(inode, type, nblocks) \
- __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits((inode)->i_sb))
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
+ ext4_trans_default_revoke_credits((inode)->i_sb))
-#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
- __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \
+ (revoke_creds))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
- int blocks, int rsv_blocks)
+ int blocks, int rsv_blocks,
+ int revoke_creds)
{
return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
- rsv_blocks);
+ rsv_blocks, revoke_creds);
}
#define ext4_journal_stop(handle) \
@@ -332,20 +346,68 @@ static inline handle_t *ext4_journal_current_handle(void)
return journal_current_handle();
}
-static inline int ext4_journal_extend(handle_t *handle, int nblocks)
+static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_extend(handle, nblocks);
+ return jbd2_journal_extend(handle, nblocks, revoke);
return 0;
}
-static inline int ext4_journal_restart(handle_t *handle, int nblocks)
+static inline int ext4_journal_restart(handle_t *handle, int nblocks,
+ int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_restart(handle, nblocks);
+ return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
return 0;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred);
+
+
+/*
+ * Ensure @handle has at least @check_cred credits available. If not, the
+ * transaction will be extended or restarted to contain at least @extend_cred
+ * credits. Before restarting the transaction, @fn is executed to allow for
+ * cleanup before the transaction is restarted.
+ *
+ * The return value is < 0 in case of error, 0 in case the handle has enough
+ * credits or transaction extension succeeded, 1 in case transaction had to be
+ * restarted.
+ */
+#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \
+ revoke_cred, fn) \
+({ \
+ __label__ __ensure_end; \
+ int err = __ext4_journal_ensure_credits((handle), (check_cred), \
+ (extend_cred), (revoke_cred)); \
+ \
+ if (err <= 0) \
+ goto __ensure_end; \
+ err = (fn); \
+ if (err < 0) \
+ goto __ensure_end; \
+ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \
+ if (err == 0) \
+ err = 1; \
+__ensure_end: \
+ err; \
+})
+
+/*
+ * Ensure the given handle has at least the requested amount of credits
+ * available, possibly restarting the transaction if needed. We also make sure
+ * the transaction has space for at least ext4_trans_default_revoke_credits(sb)
+ * revoke records, as freeing one or two blocks is a very common pattern and
+ * requesting this is very cheap.
+ */
+static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
+ int revoke_creds)
+{
+ return ext4_journal_ensure_credits_fn(handle, credits, credits,
+ revoke_creds, 0);
+}
+
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
@@ -407,6 +469,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
/* We do not support data journalling with delayed allocation */
if (!S_ISREG(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
!test_opt(inode->i_sb, DELALLOC))) {
@@ -437,6 +500,19 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
}
+static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
+{
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return 0;
+ if (!ext4_should_journal_data(inode))
+ return 0;
+ /*
+ * Data blocks in one extent are contiguous, just account for partial
+ * clusters at extent boundaries
+ */
+ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
+}
+
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
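The intended calling convention for the new credit machinery, as a sketch (the caller and the credit constants are illustrative, not taken from this patch): a long-running loop tops up buffer and revoke credits before each step, treating a return of 1 as "transaction was restarted" and a negative value as failure.

static int free_many_blocks(handle_t *handle, struct inode *inode,
			    ext4_fsblk_t *blks, int count)
{
	int i, err;

	for (i = 0; i < count; i++) {
		/* need ~1 credit per step; extend in batches of 8 */
		err = ext4_journal_ensure_credits_fn(handle, 1, 8,
				ext4_free_data_revoke_credits(inode, 1),
				0 /* @fn: nothing to clean up on restart */);
		if (err < 0)
			return err;
		/* err == 1: the handle was restarted; a real caller would
		 * revalidate any cached journalling state here */
		ext4_free_blocks(handle, inode, NULL, blks[i], 1, 0);
	}
	return 0;
}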
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index fb0f99dc8c22..0e8708b77da6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,29 +100,41 @@ static int ext4_split_extent_at(handle_t *handle,
static int ext4_find_delayed_extent(struct inode *inode,
struct extent_status *newes);
-static int ext4_ext_truncate_extend_restart(handle_t *handle,
- struct inode *inode,
- int needed)
+static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
- int err;
-
- if (!ext4_handle_valid(handle))
- return 0;
- if (handle->h_buffer_credits >= needed)
- return 0;
/*
- * If we need to extend the journal get a few extra blocks
- * while we're at it for efficiency's sake.
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+	 * moment, get_block can be called only for blocks inside i_size since
+	 * the page cache has already been dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
*/
- needed += 3;
- err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
- if (err <= 0)
- return err;
- err = ext4_truncate_restart_trans(handle, inode, needed);
- if (err == 0)
- err = -EAGAIN;
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ ext4_discard_preallocations(inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+}
- return err;
+/*
+ * Make sure 'handle' has at least 'check_cred' credits. If not, restart
+ * transaction with 'restart_cred' credits. The function drops i_data_sem
+ * when restarting transaction and gets it after transaction is restarted.
+ *
+ * The function returns 0 on success, 1 if transaction had to be restarted,
+ * and < 0 in case of fatal error.
+ */
+int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred)
+{
+ int ret;
+ int dropped = 0;
+
+ ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
+ revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
+ if (dropped)
+ down_write(&EXT4_I(inode)->i_data_sem);
+ return ret;
}
/*
@@ -1753,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
*/
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
- /*
- * The check for IO to unwritten extent is somewhat racy as we
- * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
- * dropping i_data_sem. But reserved blocks should save us in that
- * case.
- */
+
if (ext4_ext_is_unwritten(ex1) &&
- (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
- atomic_read(&EXT4_I(inode)->i_unwritten) ||
- (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
+ ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
@@ -1840,7 +1845,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
* group descriptor to release the extent tree block. If we
* can't get the journal credits, give up.
*/
- if (ext4_journal_extend(handle, 2))
+ if (ext4_journal_extend(handle, 2,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
return;
/*
@@ -2727,7 +2733,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int err = 0, correct_index = 0;
- int depth = ext_depth(inode), credits;
+ int depth = ext_depth(inode), credits, revoke_credits;
struct ext4_extent_header *eh;
ext4_lblk_t a, b;
unsigned num;
@@ -2819,10 +2825,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += (ext_depth(inode)) + 1;
}
credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- if (err)
+ /*
+ * We may end up freeing some index blocks and data from the
+ * punched range. Note that partial clusters are accounted for
+ * by ext4_free_data_revoke_credits().
+ */
+ revoke_credits =
+ ext4_free_metadata_revoke_credits(inode->i_sb,
+ ext_depth(inode)) +
+ ext4_free_data_revoke_credits(inode, b - a + 1);
+
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ credits, revoke_credits);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
goto out;
+ }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2948,7 +2967,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
+ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
+ depth + 1,
+ ext4_free_metadata_revoke_credits(inode->i_sb, depth));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4962,23 +4983,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
int ret = 0;
int ret2 = 0;
struct ext4_map_blocks map;
- unsigned int credits, blkbits = inode->i_blkbits;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
map.m_lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- /*
- * This is somewhat ugly but the idea is clear: When transaction is
- * reserved, everything goes into it. Otherwise we rather start several
- * smaller transactions for conversion of each extent separately.
- */
- if (handle) {
- handle = ext4_journal_start_reserved(handle,
- EXT4_HT_EXT_CONVERT);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- credits = 0;
- } else {
+ if (!handle) {
/*
* credits to insert 1 extent into extent tree
*/
@@ -5009,11 +5020,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
if (ret <= 0 || ret2)
break;
}
- if (!credits)
- ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
+int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
+{
+ int ret, err = 0;
+ struct ext4_io_end_vec *io_end_vec;
+
+ /*
+	 * This is somewhat ugly but the idea is clear: when a transaction is
+	 * reserved, everything goes into it. Otherwise we would rather start
+	 * several smaller transactions for converting each extent separately.
+ */
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ }
+
+ list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
+ ret = ext4_convert_unwritten_extents(handle, io_end->inode,
+ io_end_vec->offset,
+ io_end_vec->size);
+ if (ret)
+ break;
+ }
+
+ if (handle)
+ err = ext4_journal_stop(handle);
+
+ return ret < 0 ? ret : err;
+}
+
/*
* If newes is not existing extent (newes->ec_pblk equals zero) find
* delayed extent at start of newes and update newes accordingly and
@@ -5206,13 +5246,10 @@ ext4_access_path(handle_t *handle, struct inode *inode,
* descriptor) for each block group; assume two block
* groups
*/
- if (handle->h_buffer_credits < 7) {
- credits = ext4_writepage_trans_blocks(inode);
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- /* EAGAIN is success */
- if (err && err != -EAGAIN)
- return err;
- }
+ credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
+ if (err < 0)
+ return err;
err = ext4_ext_get_access(handle, inode, path);
return err;
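To make the revoke arithmetic above concrete: punching b - a + 1 = 100 blocks out of a depth-2 extent tree on a bigalloc filesystem with s_cluster_ratio = 16 reserves ext4_free_metadata_revoke_credits(sb, 2) = 2 * 16 = 32 records for index blocks, plus ext4_free_data_revoke_credits(inode, 100) = 100 + 2 * (16 - 1) = 130 records for the data range when data journalling applies to the inode (the helper returns 0 otherwise), 162 revoke records in total.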
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8d2bbcc2d813..6a7293a5cda2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,10 +29,58 @@
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
+#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "truncate.h"
+
+static bool ext4_dio_supported(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+ return false;
+ if (fsverity_active(inode))
+ return false;
+ if (ext4_should_journal_data(inode))
+ return false;
+ if (ext4_has_inline_data(inode))
+ return false;
+ return true;
+}
+
+static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ if (!ext4_dio_supported(inode)) {
+ inode_unlock_shared(inode);
+ /*
+		 * Fall back to buffered I/O if the operation being performed on
+ * the inode is not supported by direct I/O. The IOCB_DIRECT
+ * flag needs to be cleared here in order to ensure that the
+ * direct I/O path within generic_file_read_iter() is not
+ * taken.
+ */
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ return generic_file_read_iter(iocb, to);
+ }
+
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
#ifdef CONFIG_FS_DAX
- if (IS_DAX(file_inode(iocb->ki_filp)))
+ if (IS_DAX(inode))
return ext4_dax_read_iter(iocb, to);
#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_read_iter(iocb, to);
+
return generic_file_read_iter(iocb, to);
}
@@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static void ext4_unwritten_wait(struct inode *inode)
-{
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
-}
-
/*
* This tests whether the IO in question is block-aligned or not.
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
ret = generic_write_checks(iocb, from);
if (ret <= 0)
return ret;
- if (unlikely(IS_IMMUTABLE(inode)))
- return -EPERM;
-
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return -EFBIG;
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
}
+
+ ret = file_modified(iocb->ki_filp);
+ if (ret)
+ return ret;
+
return iov_iter_count(from);
}
-#ifdef CONFIG_FS_DAX
-static ssize_t
-ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
- struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
- inode_lock(inode);
- }
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
- ret = file_remove_privs(iocb->ki_filp);
- if (ret)
- goto out;
- ret = file_update_time(iocb->ki_filp);
- if (ret)
- goto out;
- ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+
out:
inode_unlock(inode);
- if (ret > 0)
+ if (likely(ret > 0)) {
+ iocb->ki_pos += ret;
ret = generic_write_sync(iocb, ret);
+ }
+
return ret;
}
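
ext4_buffered_write_iter() above fixes the ordering: write checks, copy into the page cache, advance iocb->ki_pos only on success, and only then generic_write_sync(). A small self-contained sketch of that ordering with stubbed types (nothing here is the real kiocb API):

#include <stdio.h>

struct kiocb_stub { long long ki_pos; };

/* Pretend copy into the page cache; returns bytes "written". */
static long long perform_write_stub(struct kiocb_stub *iocb, long long count)
{
        return count;
}

int main(void)
{
        struct kiocb_stub iocb = { .ki_pos = 4096 };
        long long ret = perform_write_stub(&iocb, 512);

        /* Mirrors ext4_buffered_write_iter(): the file position is only
         * advanced on success, before any O_SYNC-style writeback runs. */
        if (ret > 0)
                iocb.ki_pos += ret;
        printf("ki_pos=%lld\n", iocb.ki_pos);   /* 4608 */
        return 0;
}
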
-#endif
-static ssize_t
-ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
+ ssize_t written, size_t count)
{
+ handle_t *handle;
+ bool truncate = false;
+ u8 blkbits = inode->i_blkbits;
+ ext4_lblk_t written_blk, end_blk;
+
+ /*
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended.
+ */
+ WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
+ if (offset + count <= EXT4_I(inode)->i_disksize) {
+ /*
+ * We need to ensure that the inode is removed from the orphan
+ * list if it has been added prematurely, due to writeback of
+ * delalloc blocks.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ return PTR_ERR(handle);
+ }
+
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
+ return written;
+ }
+
+ if (written < 0)
+ goto truncate;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ written = PTR_ERR(handle);
+ goto truncate;
+ }
+
+ if (ext4_update_inode_size(inode, offset + written))
+ ext4_mark_inode_dirty(handle, inode);
+
+ /*
+ * We may need to truncate allocated but not written blocks beyond EOF.
+ */
+ written_blk = ALIGN(offset + written, 1 << blkbits);
+ end_blk = ALIGN(offset + count, 1 << blkbits);
+ if (written_blk < end_blk && ext4_can_truncate(inode))
+ truncate = true;
+
+ /*
+ * Remove the inode from the orphan list if it has been extended and
+ * everything went OK.
+ */
+ if (!truncate && inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+
+ if (truncate) {
+truncate:
+ ext4_truncate_failed_write(inode);
+ /*
+ * If the truncate operation failed early, then the inode may
+ * still be on the orphan list. In that case, we need to try to
+ * remove the inode from the in-memory linked list.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ return written;
+}
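
The written_blk/end_blk comparison above decides whether blocks were allocated beyond what was actually written. A worked example of the same ALIGN arithmetic, with illustrative byte counts (4 KiB blocks assumed):

#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

int main(void)
{
        unsigned long long blocksize = 4096;    /* 1 << blkbits */
        unsigned long long offset = 8192;       /* write start */
        unsigned long long count = 12288;       /* bytes requested */
        unsigned long long written = 5000;      /* bytes actually written */

        unsigned long long written_blk = ALIGN_UP(offset + written, blocksize);
        unsigned long long end_blk = ALIGN_UP(offset + count, blocksize);

        /* written_blk = 16384, end_blk = 20480: blocks were allocated past
         * what was written, so ext4_handle_inode_extension() truncates. */
        printf("truncate=%d\n", written_blk < end_blk);
        return 0;
}
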
+
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned int flags)
+{
+ loff_t offset = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
- int o_direct = iocb->ki_flags & IOCB_DIRECT;
- int unaligned_aio = 0;
- int overwrite = 0;
+
+ if (error)
+ return error;
+
+ if (size && flags & IOMAP_DIO_UNWRITTEN)
+ return ext4_convert_unwritten_extents(NULL, inode,
+ offset, size);
+
+ return 0;
+}
+
+static const struct iomap_dio_ops ext4_dio_write_ops = {
+ .end_io = ext4_dio_write_end_io,
+};
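
iomap_dio_rw() invokes the registered ->end_io once the direct write completes. A hedged sketch of the decision ext4_dio_write_end_io() encodes; the flag constant is an illustrative stand-in, not the real IOMAP_DIO_UNWRITTEN value:

#include <stdio.h>

#define DIO_UNWRITTEN_STUB 0x1

static int end_io_stub(long long size, int error, unsigned int flags)
{
        if (error)
                return error;   /* propagate I/O errors unchanged */
        if (size && (flags & DIO_UNWRITTEN_STUB))
                return 1;       /* would convert unwritten extents here */
        return 0;               /* nothing to do */
}

int main(void)
{
        printf("%d %d %d\n",
               end_io_stub(4096, 0, DIO_UNWRITTEN_STUB),   /* convert */
               end_io_stub(0, 0, DIO_UNWRITTEN_STUB),      /* no data */
               end_io_stub(4096, -5, 0));                  /* error wins */
        return 0;
}
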
+
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
ssize_t ret;
+ size_t count;
+ loff_t offset;
+ handle_t *handle;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ bool extend = false, overwrite = false, unaligned_aio = false;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ if (!ext4_dio_supported(inode)) {
+ inode_unlock(inode);
+ /*
+ * Fall back to buffered I/O if the inode does not support
+ * direct I/O.
+ */
+ return ext4_buffered_write_iter(iocb, from);
+ }
+
+ ret = ext4_write_checks(iocb, from);
+ if (ret <= 0) {
+ inode_unlock(inode);
+ return ret;
+ }
+
+ /*
+ * Unaligned asynchronous direct I/O writes must be serialized with
+ * respect to each other, as the zeroing of partial blocks by two
+ * competing unaligned writes can result in data corruption.
+ */
+ offset = iocb->ki_pos;
+ count = iov_iter_count(from);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
+ unaligned_aio = true;
+ inode_dio_wait(inode);
+ }
+
+ /*
+ * Determine whether the I/O will overwrite allocated and initialized
+ * blocks. If so, check to see whether it is possible to take the
+ * dioread_nolock path.
+ */
+ if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
+ ext4_should_dioread_nolock(inode)) {
+ overwrite = true;
+ downgrade_write(&inode->i_rwsem);
+ }
+
+ if (offset + count > EXT4_I(inode)->i_disksize) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+
+ extend = true;
+ ext4_journal_stop(handle);
+ }
+
+ ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
+ is_sync_kiocb(iocb) || unaligned_aio || extend);
+
+ if (extend)
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
+
+out:
+ if (overwrite)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+
+ if (ret >= 0 && iov_iter_count(from)) {
+ ssize_t err;
+ loff_t endbyte;
+
+ offset = iocb->ki_pos;
+ err = ext4_buffered_write_iter(iocb, from);
+ if (err < 0)
+ return err;
+
+ /*
+ * We need to ensure that the pages within the page cache for
+ * the range covered by this I/O are written to disk and
+ * invalidated. This is done to preserve the expected direct I/O
+ * semantics in the case where we fall back to buffered I/O to
+ * complete the remainder of the I/O request.
+ */
+ ret += err;
+ endbyte = offset + err - 1;
+ err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
+ offset, endbyte);
+ if (!err)
+ invalidate_mapping_pages(iocb->ki_filp->f_mapping,
+ offset >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+ }
+
+ return ret;
+}
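
When the direct write completes partially and buffered I/O finishes the rest, the flushed-and-invalidated page range is derived from byte offsets as sketched below (PAGE_SHIFT_STUB assumes 4 KiB pages; the byte counts are illustrative):

#include <stdio.h>

#define PAGE_SHIFT_STUB 12

int main(void)
{
        long long offset = 5000;        /* where the buffered fallback started */
        long long err = 3000;           /* bytes written by the fallback */
        long long endbyte = offset + err - 1;

        /* Both endpoints land in page 1, so only that page would be
         * written back and invalidated after the fallback completes. */
        printf("first=%lld last=%lld\n",
               offset >> PAGE_SHIFT_STUB, endbyte >> PAGE_SHIFT_STUB);
        return 0;
}
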
#ifdef CONFIG_FS_DAX
- if (IS_DAX(inode))
- return ext4_dax_write_iter(iocb, from);
-#endif
+static ssize_t
+ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ ssize_t ret;
+ size_t count;
+ loff_t offset;
+ handle_t *handle;
+ bool extend = false;
+ struct inode *inode = file_inode(iocb->ki_filp);
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret <= 0)
goto out;
- /*
- * Unaligned direct AIO must be serialized among each other as zeroing
- * of partial blocks of two competing unaligned AIOs can result in data
- * corruption.
- */
- if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
- !is_sync_kiocb(iocb) &&
- ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
- unaligned_aio = 1;
- ext4_unwritten_wait(inode);
- }
+ offset = iocb->ki_pos;
+ count = iov_iter_count(from);
- iocb->private = &overwrite;
- /* Check whether we do a DIO overwrite or not */
- if (o_direct && !unaligned_aio) {
- if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
- if (ext4_should_dioread_nolock(inode))
- overwrite = 1;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
+ if (offset + count > EXT4_I(inode)->i_disksize) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
goto out;
}
- }
- ret = __generic_file_write_iter(iocb, from);
- /*
- * Unaligned direct AIO must be the only IO in flight. Otherwise
- * overlapping aligned IO after unaligned might result in data
- * corruption.
- */
- if (ret == -EIOCBQUEUED && unaligned_aio)
- ext4_unwritten_wait(inode);
- inode_unlock(inode);
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
+ extend = true;
+ ext4_journal_stop(handle);
+ }
- return ret;
+ ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+ if (extend)
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
inode_unlock(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
return ret;
}
+#endif
+
+static ssize_t
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ return -EIO;
+
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return ext4_dax_write_iter(iocb, from);
+#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_write_iter(iocb, from);
+
+ return ext4_buffered_write_iter(iocb, from);
+}
#ifdef CONFIG_FS_DAX
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
@@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
maxbytes, i_size_read(inode));
case SEEK_HOLE:
inode_lock_shared(inode);
- offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+ offset = iomap_seek_hole(inode, offset,
+ &ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
case SEEK_DATA:
inode_lock_shared(inode);
- offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+ offset = iomap_seek_data(inode, offset,
+ &ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
}
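
From userspace, the two cases above are reached via lseek(2). A small program that exercises them (assumes a glibc system where SEEK_HOLE/SEEK_DATA are exposed under _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        if (argc < 2)
                return 1;
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0)
                return 1;
        /* These land in ext4_llseek() and, after this patch, resolve
         * holes and data through ext4_iomap_report_ops. */
        off_t hole = lseek(fd, 0, SEEK_HOLE);
        off_t data = lseek(fd, 0, SEEK_DATA);
        printf("first hole at %lld, first data at %lld\n",
               (long long)hole, (long long)data);
        close(fd);
        return 0;
}
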
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5508baa11bb6..e10206e7f4bb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode)
return ret;
}
+static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
+ bool *needs_barrier)
+{
+ int ret, err;
+
+ ret = sync_mapping_buffers(inode->i_mapping);
+ if (!(inode->i_state & I_DIRTY_ALL))
+ return ret;
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ return ret;
+
+ err = sync_inode_metadata(inode, 1);
+ if (!ret)
+ ret = err;
+
+ if (!ret)
+ ret = ext4_sync_parent(inode);
+ if (test_opt(inode->i_sb, BARRIER))
+ *needs_barrier = true;
+
+ return ret;
+}
+
+static int ext4_fsync_journal(struct inode *inode, bool datasync,
+ bool *needs_barrier)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ *needs_barrier = true;
+
+ return jbd2_complete_transaction(journal, commit_tid);
+}
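
The two helpers above feed the three-way dispatch in ext4_sync_file() further down. A trivial, purely illustrative sketch of that dispatch:

#include <stdio.h>

/* 0 = no journal, 1 = data journaling, 2 = journaled metadata */
static const char *fsync_path_stub(int mode)
{
        switch (mode) {
        case 0: return "ext4_fsync_nojournal: sync buffers + inode";
        case 1: return "ext4_force_commit: commit everything";
        default: return "ext4_fsync_journal: wait for the commit tid";
        }
}

int main(void)
{
        for (int m = 0; m < 3; m++)
                printf("%s\n", fsync_path_stub(m));
        return 0;
}
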
+
/*
* akpm: A new design for ext4_sync_file().
*
@@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode)
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
*/
-
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0, err;
- tid_t commit_tid;
bool needs_barrier = false;
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (sb_rdonly(inode->i_sb)) {
/* Make sure that we read updated s_mount_flags value */
smp_rmb();
- if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ret = -EROFS;
goto out;
}
- if (!journal) {
- ret = __generic_file_fsync(file, start, end, datasync);
- if (!ret)
- ret = ext4_sync_parent(inode);
- if (test_opt(inode->i_sb, BARRIER))
- goto issue_flush;
- goto out;
- }
-
ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
+
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode)) {
+ if (!sbi->s_journal)
+ ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
+ else if (ext4_should_journal_data(inode))
ret = ext4_force_commit(inode->i_sb);
- goto out;
- }
+ else
+ ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
- commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
- if (journal->j_flags & JBD2_BARRIER &&
- !jbd2_trans_will_send_data_barrier(journal, commit_tid))
- needs_barrier = true;
- ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
- issue_flush:
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
ret = err;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 764ff4c56233..dc333e8e51e8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -265,13 +265,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
ext4_debug("freeing inode %lu\n", ino);
trace_ext4_free_inode(inode);
- /*
- * Note: we must free any quota before locking the superblock,
- * as writing the quota to disk may need the lock as well.
- */
dquot_initialize(inode);
dquot_free_inode(inode);
- dquot_drop(inode);
is_directory = S_ISDIR(inode->i_mode);
@@ -927,7 +922,7 @@ repeat_in_this_group:
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
handle_type, nblocks,
- 0);
+ 0, 0);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_std_error(sb, err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36699a131168..3a4ab70fe9e0 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -331,11 +331,14 @@ static int ext4_alloc_branch(handle_t *handle,
for (i = 0; i <= indirect_blks; i++) {
if (i == indirect_blks) {
new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
- } else
+ } else {
ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
ar->inode, ar->goal,
ar->flags & EXT4_MB_DELALLOC_RESERVED,
NULL, &err);
+ /* Simplify error cleanup... */
+ branch[i+1].bh = NULL;
+ }
if (err) {
i--;
goto failed;
@@ -377,18 +380,25 @@ static int ext4_alloc_branch(handle_t *handle,
}
return 0;
failed:
+ if (i == indirect_blks) {
+ /* Free data blocks */
+ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
+ ar->len, 0);
+ i--;
+ }
for (; i >= 0; i--) {
/*
* We want to ext4_forget() only freshly allocated indirect
- * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
- * buffer at branch[0].bh is indirect block / inode already
- * existing before ext4_alloc_branch() was called.
+ * blocks. Buffer for new_blocks[i] is at branch[i+1].bh
+ * (buffer at branch[0].bh is indirect block / inode already
+ * existing before ext4_alloc_branch() was called). Also,
+ * because the blocks are freshly allocated, we don't need to
+ * revoke them, which is why we don't set
+ * EXT4_FREE_BLOCKS_METADATA.
*/
- if (i > 0 && i != indirect_blks && branch[i].bh)
- ext4_forget(handle, 1, ar->inode, branch[i].bh,
- branch[i].bh->b_blocknr);
- ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
- (i == indirect_blks) ? ar->len : 1, 0);
+ ext4_free_blocks(handle, ar->inode, branch[i+1].bh,
+ new_blocks[i], 1,
+ branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0);
}
return err;
}
@@ -689,27 +699,63 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
+static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int *dropped)
+{
+ int err;
+
+ if (bh) {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (unlikely(err))
+ return err;
+ }
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err))
+ return err;
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ ext4_discard_preallocations(inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+}
+
/*
* Truncate transactions can be complex and absolutely huge. So we need to
 * be able to restart the transaction at a convenient checkpoint to make
* sure we don't overflow the journal.
*
* Try to extend this transaction for the purposes of truncation. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * extending fails, we restart the transaction.
*/
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+static int ext4_ind_truncate_ensure_credits(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh,
+ int revoke_creds)
{
- if (!ext4_handle_valid(handle))
- return 0;
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
- return 0;
- return 1;
+ int ret;
+ int dropped = 0;
+
+ ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_blocks_for_truncate(inode), revoke_creds,
+ ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped));
+ if (dropped)
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (ret <= 0)
+ return ret;
+ if (bh) {
+ BUFFER_TRACE(bh, "retaking write access");
+ ret = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(ret))
+ return ret;
+ }
+ return 0;
}
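
The dropped/retaken i_data_sem handshake above is easy to get wrong: the restart callback must release the lock so the journal restart cannot deadlock, and the caller must retake it before continuing. A self-contained sketch of the same drop-restart-retake shape, with a pthread mutex standing in for i_data_sem (an analogy, not the kernel locking):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t data_sem = PTHREAD_MUTEX_INITIALIZER;

static int restart_fn_stub(int *dropped)
{
        /* Flush per-handle state here, then drop the lock so the journal
         * restart cannot deadlock against a concurrent block mapping. */
        pthread_mutex_unlock(&data_sem);
        *dropped = 1;
        return 0;
}

int main(void)
{
        int dropped = 0;

        pthread_mutex_lock(&data_sem);
        restart_fn_stub(&dropped);      /* journal restart happens here */
        if (dropped)
                pthread_mutex_lock(&data_sem); /* retake before continuing */
        pthread_mutex_unlock(&data_sem);
        printf("dropped=%d\n", dropped);
        return 0;
}
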
/*
@@ -844,27 +890,10 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
return 1;
}
- if (try_to_extend_transaction(handle, inode)) {
- if (bh) {
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- if (unlikely(err))
- goto out_err;
- }
- err = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(err))
- goto out_err;
- err = ext4_truncate_restart_trans(handle, inode,
- ext4_blocks_for_truncate(inode));
- if (unlikely(err))
- goto out_err;
- if (bh) {
- BUFFER_TRACE(bh, "retaking write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (unlikely(err))
- goto out_err;
- }
- }
+ err = ext4_ind_truncate_ensure_credits(handle, inode, bh,
+ ext4_free_data_revoke_credits(inode, count));
+ if (err < 0)
+ goto out_err;
for (p = first; p < last; p++)
*p = 0;
@@ -1047,11 +1076,11 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
*/
if (ext4_handle_is_aborted(handle))
return;
- if (try_to_extend_transaction(handle, inode)) {
- ext4_mark_inode_dirty(handle, inode);
- ext4_truncate_restart_trans(handle, inode,
- ext4_blocks_for_truncate(inode));
- }
+ if (ext4_ind_truncate_ensure_credits(handle, inode,
+ NULL,
+ ext4_free_metadata_revoke_credits(
+ inode->i_sb, 1)) < 0)
+ return;
/*
* The forget flag here is critical because if
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a7ca65177980..28f28de0c1b6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -164,39 +164,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode)
}
/*
- * Restart the transaction associated with *handle. This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
- int nblocks)
-{
- int ret;
-
- /*
- * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
- * moment, get_block can be called only for blocks inside i_size since
- * page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
- */
- BUG_ON(EXT4_JOURNAL(inode) == NULL);
- jbd_debug(2, "restarting handle %p\n", handle);
- up_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_journal_restart(handle, nblocks);
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
-
- return ret;
-}
-
-/*
* Called at the last iput() if i_nlink is zero.
*/
void ext4_evict_inode(struct inode *inode)
{
handle_t *handle;
int err;
- int extra_credits = 3;
+ /*
+ * Credits for final inode cleanup and freeing:
+ * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
+ * (xattr block freeing), bitmap, group descriptor (inode freeing)
+ */
+ int extra_credits = 6;
struct ext4_xattr_inode_array *ea_inode_array = NULL;
trace_ext4_evict_inode(inode);
@@ -252,8 +231,12 @@ void ext4_evict_inode(struct inode *inode)
if (!IS_NOQUOTA(inode))
extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+ /*
+ * Block bitmap, group descriptor, and inode are accounted in both
+ * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+ */
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
- ext4_blocks_for_truncate(inode)+extra_credits);
+ ext4_blocks_for_truncate(inode) + extra_credits - 3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -827,136 +810,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
#define DIO_MAX_BLOCKS 4096
/*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int flags)
-{
- int dio_credits;
- handle_t *handle;
- int retries = 0;
- int ret;
-
- /* Trim mapping request to maximum we can map at once for DIO */
- if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
- bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
- dio_credits = ext4_chunk_trans_blocks(inode,
- bh_result->b_size >> inode->i_blkbits);
-retry:
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- ret = _ext4_get_block(inode, iblock, bh_result, flags);
- ext4_journal_stop(handle);
-
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- if (!create)
- return _ext4_get_block(inode, iblock, bh, 0);
- return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
- sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int ret;
-
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
- /*
- * When doing DIO using unwritten extents, we need io_end to convert
- * unwritten extents to written on IO completion. We allocate io_end
- * once we spot unwritten extent and store it in b_private. Generic
- * DIO code keeps b_private set and furthermore passes the value to
- * our completion callback in 'private' argument.
- */
- if (!ret && buffer_unwritten(bh_result)) {
- if (!bh_result->b_private) {
- ext4_io_end_t *io_end;
-
- io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!io_end)
- return -ENOMEM;
- bh_result->b_private = io_end;
- ext4_set_io_unwritten_flag(inode, io_end);
- }
- set_buffer_defer_completion(bh_result);
- }
-
- return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
- sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int ret;
-
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
- /*
- * Mark inode as having pending DIO writes to unwritten extents.
- * ext4_direct_IO_write() checks this flag and converts extents to
- * written.
- */
- if (!ret && buffer_unwritten(bh_result))
- ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
- return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
-
- ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
- inode->i_ino, create);
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = _ext4_get_block(inode, iblock, bh_result, 0);
- /*
- * Blocks should have been preallocated! ext4_file_write_iter() checks
- * that.
- */
- WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
- return ret;
-}
-
-
-/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -2341,6 +2194,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
}
/*
+ * mpage_process_page - update the page's buffers for a changed extent and
+ * possibly submit the fully mapped page for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ * @m_lblk - logical block mapping.
+ * @m_pblk - corresponding physical mapping.
+ * @map_bh - determines on return whether this page requires any further
+ * mapping or not.
+ *
+ * Scan the page's buffers corresponding to the changed extent and update the
+ * buffer state according to the new extent state. We map delalloc buffers to
+ * their physical location and clear their unwritten bits. If the given page
+ * is not fully mapped, we update @map to the next extent in the page that
+ * needs mapping and return @map_bh as true.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+ ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+ bool *map_bh)
+{
+ struct buffer_head *head, *bh;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ ext4_lblk_t lblk = *m_lblk;
+ ext4_fsblk_t pblock = *m_pblk;
+ int err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ssize_t io_end_size = 0;
+ struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+
+ err = mpage_process_page_bufs(mpd, head, bh, lblk);
+ if (err > 0)
+ err = 0;
+ if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec)) {
+ err = PTR_ERR(io_end_vec);
+ goto out;
+ }
+ io_end_vec->offset = mpd->map.m_lblk << blkbits;
+ }
+ *map_bh = true;
+ goto out;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ io_end_size += (1 << blkbits);
+ } while (lblk++, (bh = bh->b_this_page) != head);
+
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+ *map_bh = false;
+out:
+ *m_lblk = lblk;
+ *m_pblk = pblock;
+ return err;
+}
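
The buffer walk in mpage_process_page() relies on b_this_page forming a ring over the page's buffers. A minimal sketch of that circular traversal with stubbed buffer heads:

#include <stdio.h>

struct bh_stub { int lblk; struct bh_stub *b_this_page; };

int main(void)
{
        struct bh_stub b0, b1, b2;
        b0 = (struct bh_stub){ .lblk = 0, .b_this_page = &b1 };
        b1 = (struct bh_stub){ .lblk = 1, .b_this_page = &b2 };
        b2 = (struct bh_stub){ .lblk = 2, .b_this_page = &b0 };

        struct bh_stub *head = &b0, *bh = head;
        int lblk = 0;

        /* Same loop shape as the kernel: visit every buffer once and
         * stop when the ring wraps back to page_buffers(page). */
        do {
                printf("visit #%d -> lblk=%d\n", lblk, bh->lblk);
        } while (lblk++, (bh = bh->b_this_page) != head);
        return 0;
}
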
+
+/*
* mpage_map_buffers - update buffers corresponding to changed extent and
* submit fully mapped pages for IO
*
@@ -2359,12 +2285,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
struct pagevec pvec;
int nr_pages, i;
struct inode *inode = mpd->inode;
- struct buffer_head *head, *bh;
int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
- sector_t pblock;
+ ext4_fsblk_t pblock;
int err;
+ bool map_bh = false;
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2380,50 +2306,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- bh = head = page_buffers(page);
- do {
- if (lblk < mpd->map.m_lblk)
- continue;
- if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
- /*
- * Buffer after end of mapped extent.
- * Find next buffer in the page to map.
- */
- mpd->map.m_len = 0;
- mpd->map.m_flags = 0;
- /*
- * FIXME: If dioread_nolock supports
- * blocksize < pagesize, we need to make
- * sure we add size mapped so far to
- * io_end->size as the following call
- * can submit the page for IO.
- */
- err = mpage_process_page_bufs(mpd, head,
- bh, lblk);
- pagevec_release(&pvec);
- if (err > 0)
- err = 0;
- return err;
- }
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock++;
- }
- clear_buffer_unwritten(bh);
- } while (lblk++, (bh = bh->b_this_page) != head);
-
+ err = mpage_process_page(mpd, page, &lblk, &pblock,
+ &map_bh);
/*
- * FIXME: This is going to break if dioread_nolock
- * supports blocksize < pagesize as we will try to
- * convert potentially unmapped parts of inode.
+ * If map_bh is true, the page may require further bh
+ * mapping, or the page may have been submitted for IO.
+ * In either case, return so that further extent mapping
+ * can be carried out.
*/
- mpd->io_submit.io_end->size += PAGE_SIZE;
+ if (err < 0 || map_bh == true)
+ goto out;
/* Page fully mapped - let IO run! */
err = mpage_submit_page(mpd, page);
- if (err < 0) {
- pagevec_release(&pvec);
- return err;
- }
+ if (err < 0)
+ goto out;
}
pagevec_release(&pvec);
}
@@ -2431,6 +2326,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
return 0;
+out:
+ pagevec_release(&pvec);
+ return err;
}
static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@ -2510,9 +2408,13 @@ static int mpage_map_and_submit_extent(handle_t *handle,
int err;
loff_t disksize;
int progress = 0;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ struct ext4_io_end_vec *io_end_vec;
- mpd->io_submit.io_end->offset =
- ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec))
+ return PTR_ERR(io_end_vec);
+ io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
@@ -3406,473 +3308,235 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
return inode->i_state & I_DIRTY_DATASYNC;
}
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned flags, struct iomap *iomap)
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
+ struct ext4_map_blocks *map, loff_t offset,
+ loff_t length)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned int blkbits = inode->i_blkbits;
- unsigned long first_block, last_block;
- struct ext4_map_blocks map;
- bool delalloc = false;
- int ret;
-
- if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
- return -EINVAL;
- first_block = offset >> blkbits;
- last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
- EXT4_MAX_LOGICAL_BLOCK);
-
- if (flags & IOMAP_REPORT) {
- if (ext4_has_inline_data(inode)) {
- ret = ext4_inline_data_iomap(inode, iomap);
- if (ret != -EAGAIN) {
- if (ret == 0 && offset >= iomap->length)
- ret = -ENOENT;
- return ret;
- }
- }
- } else {
- if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
- return -ERANGE;
- }
-
- map.m_lblk = first_block;
- map.m_len = last_block - first_block + 1;
-
- if (flags & IOMAP_REPORT) {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
-
- if (ret == 0) {
- ext4_lblk_t end = map.m_lblk + map.m_len - 1;
- struct extent_status es;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map.m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end) {
- /* entire range is a hole */
- } else if (es.es_lblk > map.m_lblk) {
- /* range starts with a hole */
- map.m_len = es.es_lblk - map.m_lblk;
- } else {
- ext4_lblk_t offs = 0;
-
- if (es.es_lblk < map.m_lblk)
- offs = map.m_lblk - es.es_lblk;
- map.m_lblk = es.es_lblk + offs;
- map.m_len = es.es_len - offs;
- delalloc = true;
- }
- }
- } else if (flags & IOMAP_WRITE) {
- int dio_credits;
- handle_t *handle;
- int retries = 0;
-
- /* Trim mapping request to maximum we can map at once for DIO */
- if (map.m_len > DIO_MAX_BLOCKS)
- map.m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-retry:
- /*
- * Either we allocate blocks and then we don't get unwritten
- * extent so we have reserved enough credits, or the blocks
- * are already allocated and unwritten and in that case
- * extent conversion fits in the credits as well.
- */
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
- dio_credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_ZERO);
- if (ret < 0) {
- ext4_journal_stop(handle);
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- return ret;
- }
-
- /*
- * If we added blocks beyond i_size, we need to make sure they
- * will get truncated if we crash before updating i_size in
- * ext4_iomap_end(). For faults we don't need to do that (and
- * even cannot because for orphan list operations inode_lock is
- * required) - if we happen to instantiate block beyond i_size,
- * it is because we race with truncate which has already added
- * the inode to the orphan list.
- */
- if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
- (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
- int err;
-
- err = ext4_orphan_add(handle, inode);
- if (err < 0) {
- ext4_journal_stop(handle);
- return err;
- }
- }
- ext4_journal_stop(handle);
- } else {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
- }
+ u8 blkbits = inode->i_blkbits;
+ /*
+ * Writes that span EOF might trigger an I/O size update on completion,
+ * so consider them to be dirty for the purpose of O_DSYNC, even if
+ * there are no other metadata changes being made or pending.
+ */
iomap->flags = 0;
- if (ext4_inode_datasync_dirty(inode))
+ if (ext4_inode_datasync_dirty(inode) ||
+ offset + length > i_size_read(inode))
iomap->flags |= IOMAP_F_DIRTY;
+
+ if (map->m_flags & EXT4_MAP_NEW)
+ iomap->flags |= IOMAP_F_NEW;
+
iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = sbi->s_daxdev;
- iomap->offset = (u64)first_block << blkbits;
- iomap->length = (u64)map.m_len << blkbits;
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ iomap->offset = (u64) map->m_lblk << blkbits;
+ iomap->length = (u64) map->m_len << blkbits;
- if (ret == 0) {
- iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
- iomap->addr = IOMAP_NULL_ADDR;
+ /*
+ * Flags passed to ext4_map_blocks() for direct I/O writes can result
+ * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
+ * set. In order for any allocated unwritten extents to be converted
+ * into written extents correctly within the ->end_io() handler, we
+ * need to ensure that the iomap->type is set appropriately. This is
+ * why the EXT4_MAP_UNWRITTEN bit is checked first.
+ */
+ if (map->m_flags & EXT4_MAP_UNWRITTEN) {
+ iomap->type = IOMAP_UNWRITTEN;
+ iomap->addr = (u64) map->m_pblk << blkbits;
+ } else if (map->m_flags & EXT4_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = (u64) map->m_pblk << blkbits;
} else {
- if (map.m_flags & EXT4_MAP_MAPPED) {
- iomap->type = IOMAP_MAPPED;
- } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- iomap->type = IOMAP_UNWRITTEN;
- } else {
- WARN_ON_ONCE(1);
- return -EIO;
- }
- iomap->addr = (u64)map.m_pblk << blkbits;
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
}
-
- if (map.m_flags & EXT4_MAP_NEW)
- iomap->flags |= IOMAP_F_NEW;
-
- return 0;
}
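
The precedence the comment above insists on can be captured in a few lines. A sketch with illustrative flag values (not the real EXT4_MAP_* bits):

#include <stdio.h>

#define MAP_UNWRITTEN_STUB 0x1
#define MAP_MAPPED_STUB    0x2

static const char *iomap_type_stub(unsigned int m_flags)
{
        /* UNWRITTEN must be tested before MAPPED because a direct-I/O
         * write can leave both bits set in m_flags. */
        if (m_flags & MAP_UNWRITTEN_STUB)
                return "IOMAP_UNWRITTEN";
        if (m_flags & MAP_MAPPED_STUB)
                return "IOMAP_MAPPED";
        return "IOMAP_HOLE";
}

int main(void)
{
        /* Both bits set resolves to UNWRITTEN, as ->end_io requires. */
        printf("%s\n", iomap_type_stub(MAP_UNWRITTEN_STUB | MAP_MAPPED_STUB));
        return 0;
}
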
-static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
- ssize_t written, unsigned flags, struct iomap *iomap)
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int flags)
{
- int ret = 0;
handle_t *handle;
- int blkbits = inode->i_blkbits;
- bool truncate = false;
+ u8 blkbits = inode->i_blkbits;
+ int ret, dio_credits, m_flags = 0, retries = 0;
- if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
- return 0;
-
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto orphan_del;
- }
- if (ext4_update_inode_size(inode, offset + written))
- ext4_mark_inode_dirty(handle, inode);
/*
- * We may need to truncate allocated but not written blocks beyond EOF.
+ * Trim the mapping request to the maximum value that we can map at
+ * once for direct I/O.
*/
- if (iomap->offset + iomap->length >
- ALIGN(inode->i_size, 1 << blkbits)) {
- ext4_lblk_t written_blk, end_blk;
+ if (map->m_len > DIO_MAX_BLOCKS)
+ map->m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
- written_blk = (offset + written) >> blkbits;
- end_blk = (offset + length) >> blkbits;
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
- }
+retry:
/*
- * Remove inode from orphan list if we were extending a inode and
- * everything went fine.
+ * Either we allocate blocks and don't get an unwritten extent, in
+ * which case we have reserved enough credits. Or, the blocks are
+ * already allocated and unwritten, in which case the extent
+ * conversion fits into the reserved credits as well.
*/
- if (!truncate && inode->i_nlink &&
- !list_empty(&EXT4_I(inode)->i_orphan))
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- if (truncate) {
- ext4_truncate_failed_write(inode);
-orphan_del:
- /*
- * If truncate failed early the inode might still be on the
- * orphan list; we need to make sure the inode is removed from
- * the orphan list in that case.
- */
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
- }
- return ret;
-}
-
-const struct iomap_ops ext4_iomap_ops = {
- .iomap_begin = ext4_iomap_begin,
- .iomap_end = ext4_iomap_end,
-};
-
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private)
-{
- ext4_io_end_t *io_end = private;
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- /* if not async direct IO just return */
- if (!io_end)
- return 0;
+ /*
+ * DAX and direct I/O are the only two operations that are currently
+ * supported with IOMAP_WRITE.
+ */
+ WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+ if (IS_DAX(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+ /*
+ * We use i_size instead of i_disksize here because delalloc writeback
+ * can complete at any point during the I/O and subsequently push the
+ * i_disksize out to i_size. This could be beyond where direct I/O is
+ * happening and thus expose allocated blocks to direct I/O reads.
+ */
+ else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE;
+ else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- ext_debug("ext4_end_io_dio(): io_end 0x%p "
- "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
- io_end, io_end->inode->i_ino, iocb, offset, size);
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
/*
- * Error during AIO DIO. We cannot convert unwritten extents as the
- * data was not written. Just clear the unwritten flag and drop io_end.
+ * We cannot fill holes in indirect tree-based inodes, as that could
+ * expose stale data in the case of a crash. Use the magic error code
+ * to fall back to buffered I/O.
*/
- if (size <= 0) {
- ext4_clear_io_unwritten_flag(io_end);
- size = 0;
- }
- io_end->offset = offset;
- io_end->size = size;
- ext4_put_io_end(io_end);
+ if (!m_flags && !ret)
+ ret = -ENOTBLK;
- return 0;
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ return ret;
}
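
The retry label above implements a bounded retry-on-ENOSPC loop. A self-contained sketch of the same control flow, with stubbed allocation and retry policy (the kernel's ext4_should_retry_alloc() is more involved):

#include <errno.h>
#include <stdio.h>

static int should_retry_stub(int *retries)
{
        return (*retries)++ < 3;        /* the kernel bounds this similarly */
}

static int alloc_stub(int *attempts)
{
        return ++(*attempts) < 2 ? -ENOSPC : 0; /* succeed on 2nd try */
}

int main(void)
{
        int retries = 0, attempts = 0, ret;

        do {
                ret = alloc_stub(&attempts);
        } while (ret == -ENOSPC && should_retry_stub(&retries));
        printf("ret=%d attempts=%d\n", ret, attempts);
        return 0;
}
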
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- ssize_t ret;
- loff_t offset = iocb->ki_pos;
- size_t count = iov_iter_count(iter);
- int overwrite = 0;
- get_block_t *get_block_func = NULL;
- int dio_flags = 0;
- loff_t final_size = offset + count;
- int orphan = 0;
- handle_t *handle;
+ int ret;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
- if (final_size > inode->i_size || final_size > ei->i_disksize) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ext4_update_i_disksize(inode, inode->i_size);
- ext4_journal_stop(handle);
- }
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
- BUG_ON(iocb->private == NULL);
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
/*
- * Make all waiters for direct IO properly wait also for extent
- * conversion. This also disallows race between truncate() and
- * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ * Calculate the first and last logical blocks respectively.
*/
- inode_dio_begin(inode);
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+ if (flags & IOMAP_WRITE)
+ ret = ext4_iomap_alloc(inode, &map, flags);
+ else
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+ if (ret < 0)
+ return ret;
- /* If we do a overwrite dio, i_mutex locking can be released */
- overwrite = *((int *)iocb->private);
+ ext4_set_iomap(inode, iomap, &map, offset, length);
- if (overwrite)
- inode_unlock(inode);
+ return 0;
+}
+static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+ ssize_t written, unsigned flags, struct iomap *iomap)
+{
/*
- * For extent mapped files we could direct write to holes and fallocate.
- *
- * Allocated blocks to fill the hole are marked as unwritten to prevent
- * parallel buffered read to expose the stale data before DIO complete
- * the data IO.
- *
- * As to previously fallocated extents, ext4 get_block will just simply
- * mark the buffer mapped but still keep the extents unwritten.
- *
- * For non AIO case, we will convert those unwritten extents to written
- * after return back from blockdev_direct_IO. That way we save us from
- * allocating io_end structure and also the overhead of offloading
- * the extent convertion to a workqueue.
- *
- * For async DIO, the conversion needs to be deferred when the
- * IO is completed. The ext4 end_io callback function will be
- * called to take care of the conversion work. Here for async
- * case, we allocate an io_end structure to hook to the iocb.
+ * Check to see whether an error occurred while writing out the data to
+ * the allocated blocks. If so, return the magic error code so that we
+ * fall back to buffered I/O and attempt to complete the remainder of
+ * the I/O. Any blocks that may have been allocated in preparation for
+ * the direct I/O will be reused during buffered I/O.
*/
- iocb->private = NULL;
- if (overwrite)
- get_block_func = ext4_dio_get_block_overwrite;
- else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
- round_down(offset, i_blocksize(inode)) >= inode->i_size) {
- get_block_func = ext4_dio_get_block;
- dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
- } else if (is_sync_kiocb(iocb)) {
- get_block_func = ext4_dio_get_block_unwritten_sync;
- dio_flags = DIO_LOCKING;
- } else {
- get_block_func = ext4_dio_get_block_unwritten_async;
- dio_flags = DIO_LOCKING;
- }
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- get_block_func, ext4_end_io_dio, NULL,
- dio_flags);
+ if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+ return -ENOTBLK;
- if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN)) {
- int err;
- /*
- * for non AIO case, since the IO is already
- * completed, we could do the conversion right here
- */
- err = ext4_convert_unwritten_extents(NULL, inode,
- offset, ret);
- if (err < 0)
- ret = err;
- ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
- }
+ return 0;
+}
- inode_dio_end(inode);
- /* take i_mutex locking again if we do a ovewrite dio */
- if (overwrite)
- inode_lock(inode);
+const struct iomap_ops ext4_iomap_ops = {
+ .iomap_begin = ext4_iomap_begin,
+ .iomap_end = ext4_iomap_end,
+};
- if (ret < 0 && final_size > inode->i_size)
- ext4_truncate_failed_write(inode);
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ struct extent_status es;
+ ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
- /* Handle extending of i_size after direct IO write */
- if (orphan) {
- int err;
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+ map->m_lblk, end, &es);
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- /*
- * We wrote the data but cannot extend
- * i_size. Bail out. In async io case, we do
- * not return error here because we have
- * already submmitted the corresponding
- * bio. Returning error here makes the caller
- * think that this IO is done and failed
- * resulting in race with bio's completion
- * handler.
- */
- if (!ret)
- ret = PTR_ERR(handle);
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
+ if (!es.es_len || es.es_lblk > end)
+ return false;
- goto out;
- }
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size || end > ei->i_disksize) {
- ext4_update_i_disksize(inode, end);
- if (end > inode->i_size)
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- }
- err = ext4_journal_stop(handle);
- if (ret == 0)
- ret = err;
+ if (es.es_lblk > map->m_lblk) {
+ map->m_len = es.es_lblk - map->m_lblk;
+ return false;
}
-out:
- return ret;
-}
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- size_t count = iov_iter_count(iter);
- ssize_t ret;
+ offset = map->m_lblk - es.es_lblk;
+ map->m_len = es.es_len - offset;
- /*
- * Shared inode_lock is enough for us - it protects against concurrent
- * writes & truncates and since we take care of writing back page cache,
- * we are protected against page writeback as well.
- */
- inode_lock_shared(inode);
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret)
- goto out_unlock;
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
- inode_unlock_shared(inode);
- return ret;
+ return true;
}
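
The clamping ext4_iomap_is_delalloc() performs when a delayed extent overlaps the queried range reduces to small index arithmetic. A worked example with illustrative block numbers:

#include <stdio.h>

int main(void)
{
        unsigned int m_lblk = 10, m_len = 20;   /* queried range: 10..29 */
        unsigned int es_lblk = 5, es_len = 10;  /* delayed extent: 5..14 */

        if (es_lblk <= m_lblk) {
                unsigned int offset = m_lblk - es_lblk;

                /* Trim the mapping so it stops where the delayed extent
                 * ends: the delalloc part reported is blocks 10..14. */
                m_len = es_len - offset;
        }
        printf("m_lblk=%u m_len=%u\n", m_lblk, m_len);
        return 0;
}
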
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- size_t count = iov_iter_count(iter);
- loff_t offset = iocb->ki_pos;
- ssize_t ret;
+ int ret;
+ bool delalloc = false;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
-#ifdef CONFIG_FS_ENCRYPTION
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
- return 0;
-#endif
- if (fsverity_active(inode))
- return 0;
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
+
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_inline_data_iomap(inode, iomap);
+ if (ret != -EAGAIN) {
+ if (ret == 0 && offset >= iomap->length)
+ ret = -ENOENT;
+ return ret;
+ }
+ }
/*
- * If we are doing data journalling we don't support O_DIRECT
+ * Calculate the first and last logical blocks respectively.
*/
- if (ext4_should_journal_data(inode))
- return 0;
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
- /* Let buffer I/O handle the inline data case. */
- if (ext4_has_inline_data(inode))
- return 0;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ delalloc = ext4_iomap_is_delalloc(inode, &map);
- trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (iov_iter_rw(iter) == READ)
- ret = ext4_direct_IO_read(iocb, iter);
- else
- ret = ext4_direct_IO_write(iocb, iter);
- trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
- return ret;
+ ext4_set_iomap(inode, iomap, &map, offset, length);
+ if (delalloc && iomap->type == IOMAP_HOLE)
+ iomap->type = IOMAP_DELALLOC;
+
+ return 0;
}
+const struct iomap_ops ext4_iomap_report_ops = {
+ .iomap_begin = ext4_iomap_begin_report,
+};
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3910,7 +3574,7 @@ static const struct address_space_operations ext4_aops = {
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -3927,7 +3591,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.bmap = ext4_bmap,
.invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -3943,7 +3607,7 @@ static const struct address_space_operations ext4_da_aops = {
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -5450,11 +5114,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * All buffers in the last page remain valid? Then there's nothing to
- * do. We do the check mainly to optimize the common PAGE_SIZE ==
- * blocksize case
+ * If the page is fully truncated, we don't need to wait for any commit
+ * (and should not, as __ext4_journalled_invalidatepage() may strip
+ * all buffers from the page but keep the page dirty, which can then
+ * confuse e.g. a concurrent ext4_writepage() seeing a dirty page
+ * without buffers). Nor do we need to wait for any commit if all buffers in
+ * the page remain valid. This is most beneficial for the common case of
+ * blocksize == PAGESIZE.
*/
- if (offset > PAGE_SIZE - i_blocksize(inode))
+ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
page = find_lock_page(inode->i_mapping,
@@ -5915,8 +5583,23 @@ static int __ext4_expand_extra_isize(struct inode *inode,
{
struct ext4_inode *raw_inode;
struct ext4_xattr_ibody_header *header;
+ unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
int error;
+ /* this was checked at iget time, but double check for good measure */
+ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+ (ei->i_extra_isize & 3)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+ ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ return -EFSCORRUPTED;
+ }
+ if ((new_extra_isize < ei->i_extra_isize) ||
+ (new_extra_isize < 4) ||
+ (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+ return -EINVAL; /* Should never happen */
+
raw_inode = ext4_raw_inode(iloc);
header = IHDR(inode, raw_inode);
@@ -5968,9 +5651,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode,
* If this is felt to be critical, then e2fsck should be run to
* force a large enough s_min_extra_isize.
*/
- if (ext4_handle_valid(handle) &&
- jbd2_journal_extend(handle,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+ if (ext4_journal_extend(handle,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
return -ENOSPC;
if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0b7f316fd30f..e8870fff8224 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1360,6 +1360,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
case EXT4_IOC_MOVE_EXT:
case EXT4_IOC_RESIZE_FS:
+ case FITRIM:
case EXT4_IOC_PRECACHE_EXTENTS:
case EXT4_IOC_SET_ENCRYPTION_POLICY:
case EXT4_IOC_GET_ENCRYPTION_PWSALT:
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b1e4d359f73b..89725fa42573 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -50,29 +50,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
needed = ext4_ext_calc_credits_for_single_extent(inode,
lb->last_block - lb->first_block + 1, path);
- /*
- * Make sure the credit we accumalated is not really high
- */
- if (needed && ext4_handle_has_enough_credits(handle,
- EXT4_RESERVE_TRANS_BLOCKS)) {
- up_write((&EXT4_I(inode)->i_data_sem));
- retval = ext4_journal_restart(handle, needed);
- down_write((&EXT4_I(inode)->i_data_sem));
- if (retval)
- goto err_out;
- } else if (needed) {
- retval = ext4_journal_extend(handle, needed);
- if (retval) {
- /*
- * IF not able to extend the journal restart the journal
- */
- up_write((&EXT4_I(inode)->i_data_sem));
- retval = ext4_journal_restart(handle, needed);
- down_write((&EXT4_I(inode)->i_data_sem));
- if (retval)
- goto err_out;
- }
- }
+ retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
+ if (retval < 0)
+ goto err_out;
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
@@ -196,42 +176,30 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
}
-static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
-{
- int retval = 0, needed;
-
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- /*
- * We are freeing a blocks. During this we touch
- * superblock, group descriptor and block bitmap.
- * So allocate a credit of 3. We may update
- * quota (user and group).
- */
- needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
- if (ext4_journal_extend(handle, needed) != 0)
- retval = ext4_journal_restart(handle, needed);
-
- return retval;
-}
-
static int free_dind_blocks(handle_t *handle,
struct inode *inode, __le32 i_data)
{
int i;
__le32 *tmp_idata;
struct buffer_head *bh;
+ struct super_block *sb = inode->i_sb;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+ int err;
- bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+ bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
- extend_credit_for_blkdel(handle, inode);
+ err = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0) {
+ put_bh(bh);
+ return err;
+ }
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(tmp_idata[i]), 1,
EXT4_FREE_BLOCKS_METADATA |
@@ -239,7 +207,10 @@ static int free_dind_blocks(handle_t *handle,
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0)
+ return err;
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -270,7 +241,10 @@ static int free_tind_blocks(handle_t *handle,
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -283,7 +257,11 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
/* ei->i_data[EXT4_IND_BLOCK] */
if (i_data[0]) {
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(i_data[0]), 1,
EXT4_FREE_BLOCKS_METADATA |
@@ -318,12 +296,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* One credit accounted for writing the
* i_data field of the original inode
*/
- retval = ext4_journal_extend(handle, 1);
- if (retval) {
- retval = ext4_journal_restart(handle, 1);
- if (retval)
- goto err_out;
- }
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto err_out;
i_data[0] = ei->i_data[EXT4_IND_BLOCK];
i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
@@ -391,15 +366,20 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
ix = EXT_FIRST_INDEX(eh);
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
retval = free_ext_idx(handle, inode, ix);
- if (retval)
- break;
+ if (retval) {
+ put_bh(bh);
+ return retval;
+ }
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
- return retval;
+ return 0;
}
/*
@@ -574,9 +554,9 @@ err_out:
}
/* We mark the tmp_inode dirty via ext4_ext_tree_init. */
- if (ext4_journal_extend(handle, 1) != 0)
- ext4_journal_restart(handle, 1);
-
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto out_stop;
/*
* Mark the tmp_inode as of size zero
*/
@@ -594,6 +574,7 @@ err_out:
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
+out_stop:
ext4_journal_stop(handle);
out:
unlock_new_inode(tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a427d2031a8d..a856997d87b5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2547,18 +2547,29 @@ static void ext4_dec_count(handle_t *handle, struct inode *inode)
}
+/*
+ * Add a non-directory inode to a directory. On success, the inode reference
+ * is consumed by the dentry instantiation. This is also indicated by
+ * clearing of the *inodep pointer. On failure, the caller is responsible
+ * for dropping the inode reference in a safe context.
+ */
static int ext4_add_nondir(handle_t *handle,
- struct dentry *dentry, struct inode *inode)
+ struct dentry *dentry, struct inode **inodep)
{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *inode = *inodep;
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_mark_inode_dirty(handle, inode);
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
d_instantiate_new(dentry, inode);
+ *inodep = NULL;
return 0;
}
drop_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
- iput(inode);
return err;
}
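Passing struct inode ** and clearing it on success lets every caller share one unconditional iput() on its exit path. A compilable sketch of this "consume the reference on success" idiom; the refcounted type and names are invented for illustration:

#include <stdio.h>
#include <stdlib.h>

struct obj { int refcount; };

static struct obj *installed;	/* stands in for the instantiated dentry */

static void put(struct obj *o)
{
	if (--o->refcount == 0)
		free(o);
}

/* On success the reference moves into 'installed' and *op is cleared. */
static int install(struct obj **op, int fail)
{
	if (fail)
		return -1;	/* caller still owns *op and must put() it */
	installed = *op;
	*op = NULL;
	return 0;
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));
	o->refcount = 1;
	int err = install(&o, 0);
	if (o)			/* non-NULL only when install() failed */
		put(o);
	if (installed)
		put(installed);
	return err ? 1 : 0;
}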
@@ -2592,12 +2603,12 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
+ err = ext4_add_nondir(handle, dentry, &inode);
}
if (handle)
ext4_journal_stop(handle);
+ if (!IS_ERR_OR_NULL(inode))
+ iput(inode);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2624,12 +2635,12 @@ retry:
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
+ err = ext4_add_nondir(handle, dentry, &inode);
}
if (handle)
ext4_journal_stop(handle);
+ if (!IS_ERR_OR_NULL(inode))
+ iput(inode);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2779,10 +2790,12 @@ retry:
if (err) {
out_clear_inode:
clear_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
iput(inode);
- goto out_stop;
+ goto out_retry;
}
ext4_inc_count(handle, dir);
ext4_update_dx_flag(dir);
@@ -2796,6 +2809,7 @@ out_clear_inode:
out_stop:
if (handle)
ext4_journal_stop(handle);
+out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -3182,18 +3196,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- if (inode->i_nlink == 0) {
- ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
- dentry->d_name.len, dentry->d_name.name);
- set_nlink(inode, 1);
- }
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
- drop_nlink(inode);
+ if (inode->i_nlink == 0)
+ ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
+ dentry->d_name.len, dentry->d_name.name);
+ else
+ drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
@@ -3328,12 +3341,11 @@ static int ext4_symlink(struct inode *dir,
inode->i_size = disk_link.len - 1;
}
EXT4_I(inode)->i_disksize = inode->i_size;
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
+ err = ext4_add_nondir(handle, dentry, &inode);
if (handle)
ext4_journal_stop(handle);
+ if (inode)
+ iput(inode);
goto out_free_encrypted_link;
err_drop_inode:
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 12ceadef32c5..24aeedb8fc75 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -31,18 +31,56 @@
#include "acl.h"
static struct kmem_cache *io_end_cachep;
+static struct kmem_cache *io_end_vec_cachep;
int __init ext4_init_pageio(void)
{
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
if (io_end_cachep == NULL)
return -ENOMEM;
+
+ io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
+ if (io_end_vec_cachep == NULL) {
+ kmem_cache_destroy(io_end_cachep);
+ return -ENOMEM;
+ }
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
+ kmem_cache_destroy(io_end_vec_cachep);
+}
+
+struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec;
+
+ io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
+ if (!io_end_vec)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&io_end_vec->list);
+ list_add_tail(&io_end_vec->list, &io_end->list_vec);
+ return io_end_vec;
+}
+
+static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec, *tmp;
+
+ if (list_empty(&io_end->list_vec))
+ return;
+ list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
+ list_del(&io_end_vec->list);
+ kmem_cache_free(io_end_vec_cachep, io_end_vec);
+ }
+}
+
+struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
+{
+ BUG_ON(list_empty(&io_end->list_vec));
+ return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}
/*
@@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
ext4_finish_bio(bio);
bio_put(bio);
}
+ ext4_free_io_end_vec(io_end);
kmem_cache_free(io_end_cachep, io_end);
}
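An io_end now carries a list of vectors instead of a single (offset, size) pair, so one io_end can describe several discontiguous unwritten ranges awaiting conversion. A userspace model of that container, using a plain singly linked list where the kernel uses list_head (field names illustrative):

#include <stdlib.h>

struct io_end_vec {
	long long offset;
	long long size;
	struct io_end_vec *next;
};

struct io_end {
	struct io_end_vec *vecs;	/* kernel: struct list_head list_vec */
};

static struct io_end_vec *alloc_io_end_vec(struct io_end *io)
{
	struct io_end_vec *v = calloc(1, sizeof(*v));
	if (!v)
		return NULL;
	v->next = io->vecs;	/* the kernel appends at the tail instead */
	io->vecs = v;
	return v;
}

static void free_io_end_vecs(struct io_end *io)
{
	while (io->vecs) {
		struct io_end_vec *v = io->vecs;
		io->vecs = v->next;
		free(v);
	}
}

int main(void)
{
	struct io_end io = { 0 };
	struct io_end_vec *v = alloc_io_end_vec(&io);
	if (v) {
		v->offset = 4096;
		v->size = 8192;
	}
	free_io_end_vecs(&io);
	return 0;
}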
@@ -136,29 +175,26 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
* cannot get to ext4_ext_truncate() before all IOs overlapping that range are
* completed (happens from ext4_free_ioend()).
*/
-static int ext4_end_io(ext4_io_end_t *io)
+static int ext4_end_io_end(ext4_io_end_t *io_end)
{
- struct inode *inode = io->inode;
- loff_t offset = io->offset;
- ssize_t size = io->size;
- handle_t *handle = io->handle;
+ struct inode *inode = io_end->inode;
+ handle_t *handle = io_end->handle;
int ret = 0;
- ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+ ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
- io, inode->i_ino, io->list.next, io->list.prev);
+ io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+ io_end->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
- "(inode %lu, offset %llu, size %zd, error %d)",
- inode->i_ino, offset, size, ret);
+ "(inode %lu, error %d)", inode->i_ino, ret);
}
- ext4_clear_io_unwritten_flag(io);
- ext4_release_io_end(io);
+ ext4_clear_io_unwritten_flag(io_end);
+ ext4_release_io_end(io_end);
return ret;
}
@@ -166,21 +202,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
+ ext4_io_end_t *io_end, *io_end0, *io_end1;
if (list_empty(head))
return;
ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
- list_for_each_entry(io, head, list) {
- cur = &io->list;
+ list_for_each_entry(io_end, head, list) {
+ cur = &io_end->list;
before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
+ io_end0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
+ io_end1 = container_of(after, ext4_io_end_t, list);
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
+ io_end, inode->i_ino, io_end0, io_end1);
}
#endif
}
@@ -207,7 +243,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
static int ext4_do_flush_completed_IO(struct inode *inode,
struct list_head *head)
{
- ext4_io_end_t *io;
+ ext4_io_end_t *io_end;
struct list_head unwritten;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -219,11 +255,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
- io = list_entry(unwritten.next, ext4_io_end_t, list);
- BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
- list_del_init(&io->list);
+ io_end = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io_end->list);
- err = ext4_end_io(io);
+ err = ext4_end_io_end(io_end);
if (unlikely(!ret && err))
ret = err;
}
@@ -242,19 +278,22 @@ void ext4_end_io_rsv_work(struct work_struct *work)
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
- ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
- if (io) {
- io->inode = inode;
- INIT_LIST_HEAD(&io->list);
- atomic_set(&io->count, 1);
+ ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);
+
+ if (io_end) {
+ io_end->inode = inode;
+ INIT_LIST_HEAD(&io_end->list);
+ INIT_LIST_HEAD(&io_end->list_vec);
+ atomic_set(&io_end->count, 1);
}
- return io;
+ return io_end;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+ list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
return;
}
@@ -268,9 +307,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
if (atomic_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_extents(io_end->handle,
- io_end->inode, io_end->offset,
- io_end->size);
+ err = ext4_convert_unwritten_io_end_vec(io_end->handle,
+ io_end);
io_end->handle = NULL;
ext4_clear_io_unwritten_flag(io_end);
}
@@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio)
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
- "(offset %llu size %ld starting block %llu)",
+ "starting block %llu)",
bio->bi_status, inode->i_ino,
- (unsigned long long) io_end->offset,
- (long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
mapping_set_error(inode->i_mapping,
@@ -358,14 +394,16 @@ void ext4_io_submit_init(struct ext4_io_submit *io,
io->io_end = NULL;
}
-static int io_submit_init_bio(struct ext4_io_submit *io,
- struct buffer_head *bh)
+static void io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
{
struct bio *bio;
+ /*
+ * bio_alloc will _always_ be able to allocate a bio if
+ * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
+ */
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- if (!bio)
- return -ENOMEM;
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
@@ -373,13 +411,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
wbc_init_bio(io->io_wbc, bio);
- return 0;
}
-static int io_submit_add_bh(struct ext4_io_submit *io,
- struct inode *inode,
- struct page *page,
- struct buffer_head *bh)
+static void io_submit_add_bh(struct ext4_io_submit *io,
+ struct inode *inode,
+ struct page *page,
+ struct buffer_head *bh)
{
int ret;
@@ -388,9 +425,7 @@ submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
- ret = io_submit_init_bio(io, bh);
- if (ret)
- return ret;
+ io_submit_init_bio(io, bh);
io->io_bio->bi_write_hint = inode->i_write_hint;
}
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
@@ -398,7 +433,6 @@ submit_and_retry:
goto submit_and_retry;
wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
io->io_next_block++;
- return 0;
}
int ext4_bio_write_page(struct ext4_io_submit *io,
@@ -491,8 +525,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
gfp_flags |= __GFP_NOFAIL;
goto retry_encrypt;
}
- bounce_page = NULL;
- goto out;
+
+ printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
+ redirty_page_for_writepage(wbc, page);
+ do {
+ clear_buffer_async_write(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ goto unlock;
}
}
@@ -500,30 +540,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh);
- if (ret) {
- /*
- * We only get here on ENOMEM. Not much else
- * we can do but mark the page as dirty, and
- * better luck next time.
- */
- break;
- }
+ io_submit_add_bh(io, inode,
+ bounce_page ? bounce_page : page, bh);
nr_submitted++;
clear_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
- /* Error stopped previous loop? Clean up buffers... */
- if (ret) {
- out:
- fscrypt_free_bounce_page(bounce_page);
- printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
- redirty_page_for_writepage(wbc, page);
- do {
- clear_buffer_async_write(bh);
- bh = bh->b_this_page;
- } while (bh != head);
- }
+unlock:
unlock_page(page);
/* Nothing submitted - we have to end page writeback */
if (!nr_submitted)
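With bio_alloc(GFP_NOIO, ...) guaranteed to succeed once direct reclaim is allowed, io_submit_add_bh() can lose its error return entirely; the only control flow left is "if the current bio is full, submit it and retry with a fresh one". A small runnable model of that submit-and-retry loop (the capacity and names are invented):

#include <stdio.h>

#define BIO_CAP 4

struct bio { int npages; };

static struct bio cur;

static void submit(struct bio *b)
{
	printf("submit bio with %d pages\n", b->npages);
	b->npages = 0;
}

/* Mirrors io_submit_add_bh(): no error path, only submit-and-retry. */
static void add_page(void)
{
retry:
	if (cur.npages == BIO_CAP) {
		submit(&cur);
		goto retry;	/* fresh bio, the add below must succeed */
	}
	cur.npages++;
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		add_page();
	if (cur.npages)
		submit(&cur);
	return 0;
}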
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index a30b203fa461..fef7755300c3 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -360,10 +360,12 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (bio == NULL) {
struct bio_post_read_ctx *ctx;
+ /*
+ * bio_alloc will _always_ be able to allocate a bio if
+ * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
+ */
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio)
- goto set_error_page;
ctx = get_bio_post_read_ctx(inode, bio, page->index);
if (IS_ERR(ctx)) {
bio_put(bio);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c0e9aef376a7..a8c0f2b5b6e1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -388,28 +388,10 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
return bh;
}
-/*
- * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA.
- * If that fails, restart the transaction & regain write access for the
- * buffer head which is used for block_bitmap modifications.
- */
-static int extend_or_restart_transaction(handle_t *handle, int thresh)
+static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits)
{
- int err;
-
- if (ext4_handle_has_enough_credits(handle, thresh))
- return 0;
-
- err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
- if (err < 0)
- return err;
- if (err) {
- err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
- if (err)
- return err;
- }
-
- return 0;
+ return ext4_journal_ensure_credits_fn(handle, credits,
+ EXT4_MAX_TRANS_DATA, 0, 0);
}
/*
@@ -451,8 +433,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
continue;
}
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
return err;
bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
@@ -544,8 +526,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
struct buffer_head *gdb;
ext4_debug("update backup group %#04llx\n", block);
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
gdb = sb_getblk(sb, block);
@@ -602,8 +584,8 @@ handle_bb:
/* Initialize block bitmap of the @group */
block = group_data[i].block_bitmap;
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
bh = bclean(handle, sb, block);
@@ -631,8 +613,8 @@ handle_ib:
/* Initialize inode bitmap of the @group */
block = group_data[i].inode_bitmap;
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
/* Mark unused entries in inode bitmap used */
bh = bclean(handle, sb, block);
@@ -1109,10 +1091,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
ext4_fsblk_t backup_block;
/* Out of journal space, and can't get more - abort - so sad */
- if (ext4_handle_valid(handle) &&
- handle->h_buffer_credits == 0 &&
- ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
- (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
break;
if (meta_bg == 0)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b3cbf8622eab..1d82b56d9b11 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1172,9 +1172,9 @@ void ext4_clear_inode(struct inode *inode)
{
invalidate_inode_buffers(inode);
clear_inode(inode);
- dquot_drop(inode);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -1388,7 +1388,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
static struct dquot **ext4_get_dquots(struct inode *inode)
{
@@ -1406,7 +1405,7 @@ static const struct dquot_operations ext4_quota_operations = {
.destroy_dquot = dquot_destroy,
.get_projid = ext4_get_projid,
.get_inode_usage = ext4_get_inode_usage,
- .get_next_id = ext4_get_next_id,
+ .get_next_id = dquot_get_next_id,
};
static const struct quotactl_ops ext4_qctl_operations = {
@@ -2065,7 +2064,7 @@ static int parse_options(char *options, struct super_block *sb,
unsigned int *journal_ioprio,
int is_remount)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
substring_t args[MAX_OPT_ARGS];
int token;
@@ -2119,16 +2118,6 @@ static int parse_options(char *options, struct super_block *sb,
}
}
#endif
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- int blocksize =
- BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
-
- if (blocksize < PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "dioread_nolock if block size != PAGE_SIZE");
- return 0;
- }
- }
return 1;
}
@@ -3569,12 +3558,15 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ unsigned def_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
- /* determine the minimum size of new large inodes, if present */
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
- sbi->s_want_extra_isize == 0) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
+ if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) {
+ sbi->s_want_extra_isize = 0;
+ return;
+ }
+ if (sbi->s_want_extra_isize < 4) {
+ sbi->s_want_extra_isize = def_extra_isize;
if (ext4_has_feature_extra_isize(sb)) {
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_want_extra_isize))
@@ -3587,10 +3579,10 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb)
}
}
/* Check if enough inode space is available */
- if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
- sbi->s_inode_size) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
+ if ((sbi->s_want_extra_isize > sbi->s_inode_size) ||
+ (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+ sbi->s_inode_size)) {
+ sbi->s_want_extra_isize = def_extra_isize;
ext4_msg(sb, KERN_INFO,
"required extra inode space not available");
}
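The clamp above reduces to simple arithmetic: 128-byte inodes get no extra space at all, and any requested value below 4 bytes or past the end of the on-disk inode falls back to sizeof(struct ext4_inode) - 128. The same selection as a standalone sketch, with the struct size (160 here) assumed purely for illustration:

#include <stdio.h>

#define GOOD_OLD_INODE_SIZE 128
#define INODE_STRUCT_SIZE 160	/* illustrative; really sizeof(struct ext4_inode) */

static unsigned clamp_want_extra_isize(unsigned inode_size, unsigned want)
{
	unsigned def = INODE_STRUCT_SIZE - GOOD_OLD_INODE_SIZE;

	if (inode_size == GOOD_OLD_INODE_SIZE)
		return 0;			/* no room for extra fields */
	if (want < 4)
		want = def;
	if (GOOD_OLD_INODE_SIZE + want > inode_size)
		want = def;			/* does not fit: fall back */
	return want;
}

int main(void)
{
	printf("%u\n", clamp_want_extra_isize(256, 2));		/* -> 32 */
	printf("%u\n", clamp_want_extra_isize(128, 32));	/* -> 0 */
	return 0;
}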
@@ -4453,13 +4445,6 @@ no_journal:
}
}
- if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
- (blocksize != PAGE_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Unsupported blocksize for fs encryption");
- goto failed_mount_wq;
- }
-
if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
goto failed_mount_wq;
@@ -5849,7 +5834,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
- err = dquot_enable(qf_inode, type, format_id, flags);
+ err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
if (err)
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
iput(qf_inode);
@@ -6033,18 +6018,6 @@ out:
}
return len;
}
-
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
-{
- const struct quota_format_ops *ops;
-
- if (!sb_has_quota_loaded(sb, qid->type))
- return -ESRCH;
- ops = sb_dqopt(sb)->ops[qid->type];
- if (!ops || !ops->get_next_id)
- return -ENOSYS;
- return dquot_get_next_id(sb, qid);
-}
#endif
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 491f9ee4040e..8966a5439a22 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -967,55 +967,6 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
return credits;
}
-static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
- int credits, struct buffer_head *bh,
- bool dirty, bool block_csum)
-{
- int error;
-
- if (!ext4_handle_valid(handle))
- return 0;
-
- if (handle->h_buffer_credits >= credits)
- return 0;
-
- error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
- if (!error)
- return 0;
- if (error < 0) {
- ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
- return error;
- }
-
- if (bh && dirty) {
- if (block_csum)
- ext4_xattr_block_csum_set(inode, bh);
- error = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (error) {
- ext4_warning(inode->i_sb, "Handle metadata (error %d)",
- error);
- return error;
- }
- }
-
- error = ext4_journal_restart(handle, credits);
- if (error) {
- ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
- return error;
- }
-
- if (bh) {
- error = ext4_journal_get_write_access(handle, bh);
- if (error) {
- ext4_warning(inode->i_sb,
- "Get write access failed (error %d)",
- error);
- return error;
- }
- }
- return 0;
-}
-
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
@@ -1149,6 +1100,24 @@ cleanup:
return saved_err;
}
+static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, bool block_csum, bool dirty)
+{
+ int error;
+
+ if (bh && dirty) {
+ if (block_csum)
+ ext4_xattr_block_csum_set(inode, bh);
+ error = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (error) {
+ ext4_warning(inode->i_sb, "Handle metadata (error %d)",
+ error);
+ return error;
+ }
+ }
+ return 0;
+}
+
static void
ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
struct buffer_head *bh,
@@ -1185,13 +1154,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
continue;
}
- err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
- dirty, block_csum);
- if (err) {
+ err = ext4_journal_ensure_credits_fn(handle, credits, credits,
+ ext4_free_metadata_revoke_credits(parent->i_sb, 1),
+ ext4_xattr_restart_fn(handle, parent, bh, block_csum,
+ dirty));
+ if (err < 0) {
ext4_warning_inode(ea_inode, "Ensure credits err=%d",
err);
continue;
}
+ if (err > 0) {
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ ext4_warning_inode(ea_inode,
+ "Re-get write access err=%d",
+ err);
+ continue;
+ }
+ }
err = ext4_xattr_inode_dec_ref(handle, ea_inode);
if (err) {
@@ -2335,7 +2315,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
flags & XATTR_CREATE);
brelse(bh);
- if (!ext4_handle_has_enough_credits(handle, credits)) {
+ if (jbd2_handle_buffer_credits(handle) < credits) {
error = -ENOSPC;
goto cleanup;
}
@@ -2862,11 +2842,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
struct inode *ea_inode;
int error;
- error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
- NULL /* bh */,
- false /* dirty */,
- false /* block_csum */);
- if (error) {
+ error = ext4_journal_ensure_credits(handle, extra_credits,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (error < 0) {
EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
goto cleanup;
}
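The xattr conversion shows the _fn variant of the credits helper: a caller-supplied fixup runs before any restart (here, flushing the dirty xattr block), and a positive return tells the caller a restart happened so it must re-get write access. A hedged, compilable sketch of that callback protocol; all names and values are illustrative:

#include <stdio.h>

static int flush_dirty_block(void *arg)
{
	(void)arg;		/* e.g. write back a dirty xattr block */
	return 0;
}

/* 0: nothing to do, 1: handle restarted (redo setup), <0: error. */
static int ensure_credits_fn(int *credits, int check, int restart_credits,
			     int (*pre_restart)(void *), void *arg)
{
	int err;

	if (*credits >= check)
		return 0;
	err = pre_restart(arg);		/* fixup before the restart */
	if (err < 0)
		return err;
	*credits = restart_credits;	/* models journal_restart() */
	return 1;
}

int main(void)
{
	int credits = 1;
	int ret = ensure_credits_fn(&credits, 4, 16, flush_dirty_block, NULL);
	if (ret > 0) {
		/* restarted: re-get write access on the buffer here */
	}
	printf("ret=%d credits=%d\n", ret, credits);
	return 0;
}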
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index a0eef95b9e0e..ffdaba0c55d2 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -581,7 +581,7 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi)
if (time_to_inject(sbi, FAULT_ORPHAN)) {
spin_unlock(&im->ino_lock);
- f2fs_show_injection_info(FAULT_ORPHAN);
+ f2fs_show_injection_info(sbi, FAULT_ORPHAN);
return -ENOSPC;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 5755e897a5f0..a034cd0ce021 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -29,6 +29,7 @@
#define NUM_PREALLOC_POST_READ_CTXS 128
static struct kmem_cache *bio_post_read_ctx_cache;
+static struct kmem_cache *bio_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static bool __is_cp_guaranteed(struct page *page)
@@ -167,9 +168,10 @@ static bool f2fs_bio_post_read_required(struct bio *bio)
static void f2fs_read_end_io(struct bio *bio)
{
- if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)),
- FAULT_READ_IO)) {
- f2fs_show_injection_info(FAULT_READ_IO);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
+
+ if (time_to_inject(sbi, FAULT_READ_IO)) {
+ f2fs_show_injection_info(sbi, FAULT_READ_IO);
bio->bi_status = BLK_STS_IOERR;
}
@@ -191,7 +193,7 @@ static void f2fs_write_end_io(struct bio *bio)
struct bvec_iter_all iter_all;
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
- f2fs_show_injection_info(FAULT_WRITE_IO);
+ f2fs_show_injection_info(sbi, FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
}
@@ -543,6 +545,126 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
return io_type_is_mergeable(io, fio);
}
+static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
+ struct page *page, enum temp_type temp)
+{
+ struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
+ struct bio_entry *be;
+
+ be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS);
+ be->bio = bio;
+ bio_get(bio);
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE)
+ f2fs_bug_on(sbi, 1);
+
+ down_write(&io->bio_list_lock);
+ list_add_tail(&be->list, &io->bio_list);
+ up_write(&io->bio_list_lock);
+}
+
+static void del_bio_entry(struct bio_entry *be)
+{
+ list_del(&be->list);
+ kmem_cache_free(bio_entry_slab, be);
+}
+
+static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio,
+ struct page *page)
+{
+ enum temp_type temp;
+ bool found = false;
+ int ret = -EAGAIN;
+
+ for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) {
+ struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
+ struct list_head *head = &io->bio_list;
+ struct bio_entry *be;
+
+ down_write(&io->bio_list_lock);
+ list_for_each_entry(be, head, list) {
+ if (be->bio != *bio)
+ continue;
+
+ found = true;
+
+ if (bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
+ ret = 0;
+ break;
+ }
+
+ /* bio is full */
+ del_bio_entry(be);
+ __submit_bio(sbi, *bio, DATA);
+ break;
+ }
+ up_write(&io->bio_list_lock);
+ }
+
+ if (ret) {
+ bio_put(*bio);
+ *bio = NULL;
+ }
+
+ return ret;
+}
+
+void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
+ struct bio **bio, struct page *page)
+{
+ enum temp_type temp;
+ bool found = false;
+ struct bio *target = bio ? *bio : NULL;
+
+ for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) {
+ struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
+ struct list_head *head = &io->bio_list;
+ struct bio_entry *be;
+
+ if (list_empty(head))
+ continue;
+
+ down_read(&io->bio_list_lock);
+ list_for_each_entry(be, head, list) {
+ if (target)
+ found = (target == be->bio);
+ else
+ found = __has_merged_page(be->bio, NULL,
+ page, 0);
+ if (found)
+ break;
+ }
+ up_read(&io->bio_list_lock);
+
+ if (!found)
+ continue;
+
+ found = false;
+
+ down_write(&io->bio_list_lock);
+ list_for_each_entry(be, head, list) {
+ if (target)
+ found = (target == be->bio);
+ else
+ found = __has_merged_page(be->bio, NULL,
+ page, 0);
+ if (found) {
+ target = be->bio;
+ del_bio_entry(be);
+ break;
+ }
+ }
+ up_write(&io->bio_list_lock);
+ }
+
+ if (found)
+ __submit_bio(sbi, target, DATA);
+ if (bio && *bio) {
+ bio_put(*bio);
+ *bio = NULL;
+ }
+}
+
int f2fs_merge_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio = *fio->bio;
@@ -557,20 +679,17 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
f2fs_trace_ios(fio, 0);
if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block,
- fio->new_blkaddr)) {
- __submit_bio(fio->sbi, bio, fio->type);
- bio = NULL;
- }
+ fio->new_blkaddr))
+ f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL);
alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_PAGES);
bio_set_op_attrs(bio, fio->op, fio->op_flags);
- }
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- __submit_bio(fio->sbi, bio, fio->type);
- bio = NULL;
- goto alloc_new;
+ add_bio_entry(fio->sbi, bio, page, fio->temp);
+ } else {
+ if (add_ipu_page(fio->sbi, &bio, page))
+ goto alloc_new;
}
if (fio->io_wbc)
@@ -584,19 +703,6 @@ alloc_new:
return 0;
}
-static void f2fs_submit_ipu_bio(struct f2fs_sb_info *sbi, struct bio **bio,
- struct page *page)
-{
- if (!bio)
- return;
-
- if (!__has_merged_page(*bio, NULL, page, 0))
- return;
-
- __submit_bio(sbi, *bio, DATA);
- *bio = NULL;
-}
-
void f2fs_submit_page_write(struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = fio->sbi;
@@ -2098,7 +2204,7 @@ static int __write_data_page(struct page *page, bool *submitted,
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_SHIFT;
- loff_t psize = (page->index + 1) << PAGE_SHIFT;
+ loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT;
unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
@@ -2215,14 +2321,12 @@ out:
unlock_page(page);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
- !F2FS_I(inode)->cp_task) {
- f2fs_submit_ipu_bio(sbi, bio, page);
+ !F2FS_I(inode)->cp_task)
f2fs_balance_fs(sbi, need_balance_fs);
- }
if (unlikely(f2fs_cp_error(sbi))) {
- f2fs_submit_ipu_bio(sbi, bio, page);
f2fs_submit_merged_write(sbi, DATA);
+ f2fs_submit_merged_ipu_write(sbi, bio, NULL);
submitted = NULL;
}
@@ -2342,13 +2446,11 @@ continue_unlock:
}
if (PageWriteback(page)) {
- if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
f2fs_wait_on_page_writeback(page,
DATA, true, true);
- f2fs_submit_ipu_bio(sbi, &bio, page);
- } else {
+ else
goto continue_unlock;
- }
}
if (!clear_page_dirty_for_io(page))
@@ -2406,7 +2508,7 @@ continue_unlock:
NULL, 0, DATA);
/* submit cached bio of IPU write */
if (bio)
- __submit_bio(sbi, bio, DATA);
+ f2fs_submit_merged_ipu_write(sbi, &bio, NULL);
return ret;
}
@@ -3211,8 +3313,22 @@ fail:
return -ENOMEM;
}
-void __exit f2fs_destroy_post_read_processing(void)
+void f2fs_destroy_post_read_processing(void)
{
mempool_destroy(bio_post_read_ctx_pool);
kmem_cache_destroy(bio_post_read_ctx_cache);
}
+
+int __init f2fs_init_bio_entry_cache(void)
+{
+ bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab",
+ sizeof(struct bio_entry));
+ if (!bio_entry_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+void __exit f2fs_destroy_bio_entry_cache(void)
+{
+ kmem_cache_destroy(bio_entry_slab);
+}
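The new per-temperature bio_list lets in-place-update writes cache an open bio and find it again from another context: lookups take the rw_semaphore for read, removal for write. A simplified single-list model in C, with a pthread rwlock standing in for the kernel rw_semaphore and all names invented for illustration:

#include <pthread.h>
#include <stdlib.h>

struct bio_entry {
	void *bio;
	struct bio_entry *next;
};

static struct bio_entry *bio_list;
static pthread_rwlock_t bio_list_lock = PTHREAD_RWLOCK_INITIALIZER;

static void add_bio_entry(void *bio)
{
	struct bio_entry *be = calloc(1, sizeof(*be));
	be->bio = bio;
	pthread_rwlock_wrlock(&bio_list_lock);
	be->next = bio_list;
	bio_list = be;
	pthread_rwlock_unlock(&bio_list_lock);
}

/* Remove a cached bio so the caller can submit it; NULL if not cached. */
static void *take_bio_entry(void *bio)
{
	void *found = NULL;
	pthread_rwlock_wrlock(&bio_list_lock);
	for (struct bio_entry **p = &bio_list; *p; p = &(*p)->next) {
		if ((*p)->bio == bio) {
			struct bio_entry *be = *p;
			*p = be->next;
			found = be->bio;
			free(be);
			break;
		}
	}
	pthread_rwlock_unlock(&bio_list_lock);
	return found;
}

int main(void)
{
	int dummy;
	add_bio_entry(&dummy);
	return take_bio_entry(&dummy) ? 0 : 1;
}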
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 4033778bcbbf..c967cacf979e 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -628,7 +628,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
start:
if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) {
- f2fs_show_injection_info(FAULT_DIR_DEPTH);
+ f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH);
return -ENOSPC;
}
@@ -919,8 +919,9 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
bit_pos++;
ctx->pos = start_pos + bit_pos;
printk_ratelimited(
- "%s, invalid namelen(0), ino:%u, run fsck to fix.",
- KERN_WARNING, le32_to_cpu(de->ino));
+ "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
+ KERN_WARNING, sbi->sb->s_id,
+ le32_to_cpu(de->ino));
set_sbi_flag(sbi, SBI_NEED_FSCK);
continue;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4024790028aa..5a888a063c7f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -890,6 +890,7 @@ enum {
CURSEG_WARM_NODE, /* direct node blocks of normal files */
CURSEG_COLD_NODE, /* indirect node blocks */
NO_CHECK_TYPE,
+ CURSEG_COLD_DATA_PINNED, /* cold data for pinned files */
};
struct flush_cmd {
@@ -1068,6 +1069,11 @@ struct f2fs_io_info {
unsigned char version; /* version of the node */
};
+struct bio_entry {
+ struct bio *bio;
+ struct list_head list;
+};
+
#define is_read_io(rw) ((rw) == READ)
struct f2fs_bio_info {
struct f2fs_sb_info *sbi; /* f2fs superblock */
@@ -1077,6 +1083,8 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
spinlock_t io_lock; /* serialize DATA/NODE IOs */
struct list_head io_list; /* track fios */
+ struct list_head bio_list; /* bio entry list head */
+ struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */
};
#define FDEV(i) (sbi->devs[i])
@@ -1289,11 +1297,13 @@ struct f2fs_sb_info {
unsigned int gc_mode; /* current GC state */
unsigned int next_victim_seg[2]; /* next segment in victim section */
/* for skip statistic */
+ unsigned int atomic_files; /* # of opened atomic files */
unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
/* threshold for gc trials on pinned files */
u64 gc_pin_file_threshold;
+ struct rw_semaphore pin_sem;
/* maximum # of trials to find a victim segment for SSR and GC */
unsigned int max_victim_search;
@@ -1365,9 +1375,10 @@ struct f2fs_private_dio {
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
-#define f2fs_show_injection_info(type) \
- printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n", \
- KERN_INFO, f2fs_fault_name[type], \
+#define f2fs_show_injection_info(sbi, type) \
+ printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \
+ KERN_INFO, sbi->sb->s_id, \
+ f2fs_fault_name[type], \
__func__, __builtin_return_address(0))
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
@@ -1387,7 +1398,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
return false;
}
#else
-#define f2fs_show_injection_info(type) do { } while (0)
+#define f2fs_show_injection_info(sbi, type) do { } while (0)
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
return false;
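f2fs_show_injection_info() now takes sbi so the message names the filesystem instance, which is why every call site in this series gains an sbi (or F2FS_*_SB()) argument. A minimal userspace model of the injection check and message, with the rate limiting omitted and all names assumed for illustration:

#include <stdio.h>
#include <stdlib.h>

struct sb_info { const char *id; int inject_rate; };

#define show_injection_info(sbi, what) \
	fprintf(stderr, "F2FS-fs (%s) : inject %s in %s\n", \
		(sbi)->id, (what), __func__)

static int time_to_inject(struct sb_info *sbi)
{
	return sbi->inject_rate && (rand() % sbi->inject_rate) == 0;
}

static int fake_alloc(struct sb_info *sbi)
{
	if (time_to_inject(sbi)) {
		show_injection_info(sbi, "FAULT_BLOCK");
		return -28;	/* stands in for -ENOSPC */
	}
	return 0;
}

int main(void)
{
	struct sb_info sbi = { .id = "sda1", .inject_rate = 2 };
	for (int i = 0; i < 4; i++)
		fake_alloc(&sbi);
	return 0;
}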
@@ -1772,7 +1783,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
return ret;
if (time_to_inject(sbi, FAULT_BLOCK)) {
- f2fs_show_injection_info(FAULT_BLOCK);
+ f2fs_show_injection_info(sbi, FAULT_BLOCK);
release = *count;
goto release_quota;
}
@@ -2024,7 +2035,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
}
if (time_to_inject(sbi, FAULT_BLOCK)) {
- f2fs_show_injection_info(FAULT_BLOCK);
+ f2fs_show_injection_info(sbi, FAULT_BLOCK);
goto enospc;
}
@@ -2139,7 +2150,8 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
return page;
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) {
- f2fs_show_injection_info(FAULT_PAGE_ALLOC);
+ f2fs_show_injection_info(F2FS_M_SB(mapping),
+ FAULT_PAGE_ALLOC);
return NULL;
}
}
@@ -2154,7 +2166,7 @@ static inline struct page *f2fs_pagecache_get_page(
int fgp_flags, gfp_t gfp_mask)
{
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) {
- f2fs_show_injection_info(FAULT_PAGE_GET);
+ f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET);
return NULL;
}
@@ -2223,7 +2235,7 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi,
return bio;
}
if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
- f2fs_show_injection_info(FAULT_ALLOC_BIO);
+ f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
return NULL;
}
@@ -2704,6 +2716,20 @@ static inline void clear_file(struct inode *inode, int type)
f2fs_mark_inode_dirty_sync(inode, true);
}
+static inline bool f2fs_is_time_consistent(struct inode *inode)
+{
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
+ return false;
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
+ return false;
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
+ return false;
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3,
+ &F2FS_I(inode)->i_crtime))
+ return false;
+ return true;
+}
+
static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
{
bool ret;
@@ -2721,14 +2747,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
i_size_read(inode) & ~PAGE_MASK)
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
- return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
- return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
- return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3,
- &F2FS_I(inode)->i_crtime))
+ if (!f2fs_is_time_consistent(inode))
return false;
down_read(&F2FS_I(inode)->i_sem);
@@ -2783,7 +2802,7 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
void *ret;
if (time_to_inject(sbi, FAULT_KMALLOC)) {
- f2fs_show_injection_info(FAULT_KMALLOC);
+ f2fs_show_injection_info(sbi, FAULT_KMALLOC);
return NULL;
}
@@ -2804,7 +2823,7 @@ static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
if (time_to_inject(sbi, FAULT_KVMALLOC)) {
- f2fs_show_injection_info(FAULT_KVMALLOC);
+ f2fs_show_injection_info(sbi, FAULT_KVMALLOC);
return NULL;
}
@@ -3102,7 +3121,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end);
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
@@ -3188,10 +3207,14 @@ void f2fs_destroy_checkpoint_caches(void);
*/
int f2fs_init_post_read_processing(void);
void f2fs_destroy_post_read_processing(void);
+int f2fs_init_bio_entry_cache(void);
+void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct inode *inode, struct page *page,
nid_t ino, enum page_type type);
+void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
+ struct bio **bio, struct page *page);
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi);
int f2fs_submit_page_bio(struct f2fs_io_info *fio);
int f2fs_merge_page_bio(struct f2fs_io_info *fio);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6a2e5b7d8fc7..85af112e868d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -681,7 +681,7 @@ int f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) {
- f2fs_show_injection_info(FAULT_TRUNCATE);
+ f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE);
return -EIO;
}
@@ -1142,7 +1142,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
}
dn.ofs_in_node++;
i++;
- new_size = (dst + i) << PAGE_SHIFT;
+ new_size = (loff_t)(dst + i) << PAGE_SHIFT;
if (dst_inode->i_size < new_size)
f2fs_i_size_write(dst_inode, new_size);
} while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR));
@@ -1548,12 +1548,44 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (off_end)
map.m_len++;
- if (f2fs_is_pinned_file(inode))
- map.m_seg_type = CURSEG_COLD_DATA;
+ if (!map.m_len)
+ return 0;
+
+ if (f2fs_is_pinned_file(inode)) {
+ block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
+ sbi->log_blocks_per_seg;
+ block_t done = 0;
+
+ if (map.m_len % sbi->blocks_per_seg)
+ len += sbi->blocks_per_seg;
+
+ map.m_len = sbi->blocks_per_seg;
+next_alloc:
+ if (has_not_enough_free_secs(sbi, 0,
+ GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
+ mutex_lock(&sbi->gc_mutex);
+ err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+ if (err && err != -ENODATA && err != -EAGAIN)
+ goto out_err;
+ }
+
+ down_write(&sbi->pin_sem);
+ map.m_seg_type = CURSEG_COLD_DATA_PINNED;
+ f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
+ err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+ up_write(&sbi->pin_sem);
+
+ done += map.m_len;
+ len -= map.m_len;
+ map.m_lblk += map.m_len;
+ if (!err && len)
+ goto next_alloc;
- err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
- F2FS_GET_BLOCK_PRE_DIO :
- F2FS_GET_BLOCK_PRE_AIO));
+ map.m_len = done;
+ } else {
+ err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ }
+out_err:
if (err) {
pgoff_t last_off;
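For pinned files the requested length is first rounded up to whole segments, then allocated one segment per pass, each pass taking pin_sem and a fresh cold segment. The round-up and chunking arithmetic, sketched with a segment size of 512 blocks assumed for illustration:

#include <stdio.h>

#define BLKS_PER_SEG 512	/* illustrative; f2fs: sbi->blocks_per_seg */

int main(void)
{
	unsigned long long want = 1300;	/* blocks requested */
	unsigned long long len = (want / BLKS_PER_SEG) * BLKS_PER_SEG;
	unsigned long long done = 0;

	if (want % BLKS_PER_SEG)
		len += BLKS_PER_SEG;	/* round up to whole segments */

	while (len) {			/* one segment per allocation pass */
		done += BLKS_PER_SEG;
		len -= BLKS_PER_SEG;
	}
	printf("allocated %llu blocks for %llu requested\n", done, want);
	return 0;
}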
@@ -1893,6 +1925,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
if (list_empty(&fi->inmem_ilist))
list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]);
+ sbi->atomic_files++;
spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
/* add inode in inmem_list first and set atomic_file */
@@ -3406,6 +3439,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_RELEASE_VOLATILE_WRITE:
case F2FS_IOC_ABORT_VOLATILE_WRITE:
case F2FS_IOC_SHUTDOWN:
+ case FITRIM:
case F2FS_IOC_SET_ENCRYPTION_POLICY:
case F2FS_IOC_GET_ENCRYPTION_PWSALT:
case F2FS_IOC_GET_ENCRYPTION_POLICY:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 5877bd729689..b3d399623290 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -54,7 +54,7 @@ static int gc_thread_func(void *data)
}
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
- f2fs_show_injection_info(FAULT_CHECKPOINT);
+ f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
f2fs_stop_checkpoint(sbi, false);
}
@@ -1012,8 +1012,14 @@ next_step:
block_t start_bidx;
nid_t nid = le32_to_cpu(entry->nid);
- /* stop BG_GC if there is not enough free sections. */
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
+ /*
+ * Stop BG_GC if there are not enough free sections.
+ * Also stop GC if the segment has become fully valid due to
+ * a race with SSR block allocation.
+ */
+ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
+ get_valid_blocks(sbi, segno, false) ==
+ sbi->blocks_per_seg)
return submitted;
if (check_valid_map(sbi, segno, off) == 0)
@@ -1437,11 +1443,20 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
raw_sb->block_count = cpu_to_le64(block_count +
(long long)segs * sbi->blocks_per_seg);
+ if (f2fs_is_multi_device(sbi)) {
+ int last_dev = sbi->s_ndevs - 1;
+ int dev_segs =
+ le32_to_cpu(raw_sb->devs[last_dev].total_segments);
+
+ raw_sb->devs[last_dev].total_segments =
+ cpu_to_le32(dev_segs + segs);
+ }
}
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
{
int segs = secs * sbi->segs_per_sec;
+ long long blks = (long long)segs * sbi->blocks_per_seg;
long long user_block_count =
le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);
@@ -1449,8 +1464,20 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
- F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count +
- (long long)segs * sbi->blocks_per_seg);
+ F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);
+
+ if (f2fs_is_multi_device(sbi)) {
+ int last_dev = sbi->s_ndevs - 1;
+
+ FDEV(last_dev).total_segments =
+ (int)FDEV(last_dev).total_segments + segs;
+ FDEV(last_dev).end_blk =
+ (long long)FDEV(last_dev).end_blk + blks;
+#ifdef CONFIG_BLK_DEV_ZONED
+ FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz +
+ (int)(blks >> sbi->log_blocks_per_blkz);
+#endif
+ }
}
int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
@@ -1465,6 +1492,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
if (block_count > old_block_count)
return -EINVAL;
+ if (f2fs_is_multi_device(sbi)) {
+ int last_dev = sbi->s_ndevs - 1;
+ __u64 last_segs = FDEV(last_dev).total_segments;
+
+ if (block_count + last_segs * sbi->blocks_per_seg <=
+ old_block_count)
+ return -EINVAL;
+ }
+
/* new fs size should align to section size */
div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem);
if (rem)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index db4fec30c30d..502bd491336a 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -615,7 +615,11 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
- if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
+ /*
+ * atime could be updated without dirtying the f2fs inode in lazytime mode
+ */
+ if (f2fs_is_time_consistent(inode) &&
+ !is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
if (!f2fs_is_checkpoint_ready(sbi))
@@ -677,7 +681,7 @@ retry:
err = f2fs_truncate(inode);
if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
- f2fs_show_injection_info(FAULT_EVICT_INODE);
+ f2fs_show_injection_info(sbi, FAULT_EVICT_INODE);
err = -EIO;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 4faf06e8bf89..a1c507b0b4ac 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -981,7 +981,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!old_dir_entry || whiteout)
file_lost_pino(old_inode);
else
- F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+ /* adjust dir's i_pino to pass fsck check */
+ f2fs_i_pino_write(old_inode, new_dir->i_ino);
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
@@ -1141,7 +1142,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_set_link(old_dir, old_entry, old_page, new_inode);
down_write(&F2FS_I(old_inode)->i_sem);
- file_lost_pino(old_inode);
+ if (!old_dir_entry)
+ file_lost_pino(old_inode);
+ else
+ /* adjust dir's i_pino to pass fsck check */
+ f2fs_i_pino_write(old_inode, new_dir->i_ino);
up_write(&F2FS_I(old_inode)->i_sem);
old_dir->i_ctime = current_time(old_dir);
@@ -1156,7 +1161,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
down_write(&F2FS_I(new_inode)->i_sem);
- file_lost_pino(new_inode);
+ if (!new_dir_entry)
+ file_lost_pino(new_inode);
+ else
+ /* adjust dir's i_pino to pass fsck check */
+ f2fs_i_pino_write(new_inode, old_dir->i_ino);
up_write(&F2FS_I(new_inode)->i_sem);
new_dir->i_ctime = current_time(new_dir);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8b66bc4c004b..3314a0f3405e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2349,7 +2349,6 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
if (ret) {
up_read(&nm_i->nat_tree_lock);
- f2fs_bug_on(sbi, !mount);
f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
return ret;
}
@@ -2399,7 +2398,7 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct free_nid *i = NULL;
retry:
if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
- f2fs_show_injection_info(FAULT_ALLOC_NID);
+ f2fs_show_injection_info(sbi, FAULT_ALLOC_NID);
return false;
}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 783773e4560d..76477f71d4ee 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -711,7 +711,7 @@ next:
f2fs_put_page(page, 1);
}
if (!err)
- f2fs_allocate_new_segments(sbi);
+ f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 2c997f94a3b2..56e81447e2f3 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -288,6 +288,8 @@ void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure)
struct list_head *head = &sbi->inode_list[ATOMIC_FILE];
struct inode *inode;
struct f2fs_inode_info *fi;
+ unsigned int count = sbi->atomic_files;
+ unsigned int looped = 0;
next:
spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
if (list_empty(head)) {
@@ -296,22 +298,26 @@ next:
}
fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist);
inode = igrab(&fi->vfs_inode);
+ if (inode)
+ list_move_tail(&fi->inmem_ilist, head);
spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
if (inode) {
if (gc_failure) {
- if (fi->i_gc_failures[GC_FAILURE_ATOMIC])
- goto drop;
- goto skip;
+ if (!fi->i_gc_failures[GC_FAILURE_ATOMIC])
+ goto skip;
}
-drop:
set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
f2fs_drop_inmem_pages(inode);
+skip:
iput(inode);
}
-skip:
congestion_wait(BLK_RW_ASYNC, HZ/50);
cond_resched();
+ if (gc_failure) {
+ if (++looped >= count)
+ return;
+ }
goto next;
}
@@ -327,13 +333,16 @@ void f2fs_drop_inmem_pages(struct inode *inode)
mutex_unlock(&fi->inmem_lock);
}
- clear_inode_flag(inode, FI_ATOMIC_FILE);
fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
stat_dec_atomic_write(inode);
spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
if (!list_empty(&fi->inmem_ilist))
list_del_init(&fi->inmem_ilist);
+ if (f2fs_is_atomic_file(inode)) {
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+ sbi->atomic_files--;
+ }
spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
}
@@ -480,7 +489,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
- f2fs_show_injection_info(FAULT_CHECKPOINT);
+ f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
f2fs_stop_checkpoint(sbi, false);
}
@@ -1008,8 +1017,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
if (dc->error)
printk_ratelimited(
- "%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d",
- KERN_INFO, dc->lstart, dc->start, dc->len, dc->error);
+ "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d",
+ KERN_INFO, sbi->sb->s_id,
+ dc->lstart, dc->start, dc->len, dc->error);
__detach_discard_cmd(dcc, dc);
}
@@ -1149,7 +1159,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
dc->len += len;
if (time_to_inject(sbi, FAULT_DISCARD)) {
- f2fs_show_injection_info(FAULT_DISCARD);
+ f2fs_show_injection_info(sbi, FAULT_DISCARD);
err = -EIO;
goto submit;
}
@@ -2691,7 +2701,7 @@ unlock:
up_read(&SM_I(sbi)->curseg_lock);
}
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg;
unsigned int old_segno;
@@ -2700,10 +2710,17 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
down_write(&SIT_I(sbi)->sentry_lock);
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ if (type != NO_CHECK_TYPE && i != type)
+ continue;
+
curseg = CURSEG_I(sbi, i);
- old_segno = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
- locate_dirty_segment(sbi, old_segno);
+ if (type == NO_CHECK_TYPE || curseg->next_blkoff ||
+ get_valid_blocks(sbi, curseg->segno, false) ||
+ get_ckpt_valid_blocks(sbi, curseg->segno)) {
+ old_segno = curseg->segno;
+ SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
+ locate_dirty_segment(sbi, old_segno);
+ }
}
up_write(&SIT_I(sbi)->sentry_lock);
@@ -3069,6 +3086,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
+ bool put_pin_sem = false;
+
+ if (type == CURSEG_COLD_DATA) {
+ /* GC during CURSEG_COLD_DATA_PINNED allocation */
+ if (down_read_trylock(&sbi->pin_sem)) {
+ put_pin_sem = true;
+ } else {
+ type = CURSEG_WARM_DATA;
+ curseg = CURSEG_I(sbi, type);
+ }
+ } else if (type == CURSEG_COLD_DATA_PINNED) {
+ type = CURSEG_COLD_DATA;
+ }
down_read(&SM_I(sbi)->curseg_lock);
@@ -3134,6 +3164,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
up_read(&SM_I(sbi)->curseg_lock);
+
+ if (put_pin_sem)
+ up_read(&sbi->pin_sem);
}
static void update_device_state(struct f2fs_io_info *fio)
@@ -3380,7 +3413,10 @@ void f2fs_wait_on_page_writeback(struct page *page,
if (PageWriteback(page)) {
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ /* submit cached LFS IO */
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
+ /* submit cached IPU IO */
+ f2fs_submit_merged_ipu_write(sbi, NULL, page);
if (ordered) {
wait_on_page_writeback(page);
f2fs_bug_on(sbi, locked && PageWriteback(page));
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 325781a1ae4d..a95467b202ea 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -313,6 +313,8 @@ struct sit_entry_set {
*/
static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
{
+ if (type == CURSEG_COLD_DATA_PINNED)
+ type = CURSEG_COLD_DATA;
return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
}
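
CURSEG_COLD_DATA_PINNED is a pseudo log type with no curseg slot of its own, so the accessor above folds it onto the cold-data log. A hedged illustration of the resulting aliasing:

/* illustration only: both lookups resolve to the same slot because
 * the pinned pseudo-type shares the cold-data log */
struct curseg_info *cold   = CURSEG_I(sbi, CURSEG_COLD_DATA);
struct curseg_info *pinned = CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED);
/* cold == pinned */
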
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 197ad6b314de..5111e1ffe58a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1213,9 +1213,13 @@ static int f2fs_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = (dquot->dq_dqb.dqb_bsoftlimit ?
- dquot->dq_dqb.dqb_bsoftlimit :
- dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+ limit = 0;
+ if (dquot->dq_dqb.dqb_bsoftlimit)
+ limit = dquot->dq_dqb.dqb_bsoftlimit;
+ if (dquot->dq_dqb.dqb_bhardlimit &&
+ (!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
+ limit = dquot->dq_dqb.dqb_bhardlimit;
+
if (limit && buf->f_blocks > limit) {
curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
buf->f_blocks = limit;
@@ -1224,9 +1228,13 @@ static int f2fs_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = dquot->dq_dqb.dqb_isoftlimit ?
- dquot->dq_dqb.dqb_isoftlimit :
- dquot->dq_dqb.dqb_ihardlimit;
+ limit = 0;
+ if (dquot->dq_dqb.dqb_isoftlimit)
+ limit = dquot->dq_dqb.dqb_isoftlimit;
+ if (dquot->dq_dqb.dqb_ihardlimit &&
+ (!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
+ limit = dquot->dq_dqb.dqb_ihardlimit;
+
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
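
Both blocks above replace "soft limit if set, else hard limit" with the stricter rule of reporting whichever configured limit is smaller. A minimal helper capturing the same selection, for illustration only:

/* hedged sketch, equivalent to the open-coded selection above */
static inline u64 stricter_limit(u64 soft, u64 hard)
{
	if (soft && hard)
		return min(soft, hard);
	return soft ?: hard;	/* whichever one is configured, or 0 */
}
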
@@ -1932,7 +1940,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id,
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
- err = dquot_enable(qf_inode, type, format_id, flags);
+ err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
iput(qf_inode);
return err;
}
@@ -2618,6 +2626,21 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
+ if (RDEV(0).path[0]) {
+ block_t dev_seg_count = le32_to_cpu(RDEV(0).total_segments);
+ int i = 1;
+
+ while (i < MAX_DEVICES && RDEV(i).path[0]) {
+ dev_seg_count += le32_to_cpu(RDEV(i).total_segments);
+ i++;
+ }
+ if (segment_count != dev_seg_count) {
+ f2fs_info(sbi, "Segment count (%u) mismatch with total segments from devices (%u)",
+ segment_count, dev_seg_count);
+ return -EFSCORRUPTED;
+ }
+ }
+
if (secs_per_zone > total_sections || !secs_per_zone) {
f2fs_info(sbi, "Wrong secs_per_zone / total_sections (%u, %u)",
secs_per_zone, total_sections);
@@ -2852,6 +2875,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
spin_lock_init(&sbi->dev_lock);
init_rwsem(&sbi->sb_lock);
+ init_rwsem(&sbi->pin_sem);
}
static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -2946,6 +2970,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Unable to read %dth superblock",
block + 1);
err = -EIO;
+ *recovery = 1;
continue;
}
@@ -2955,6 +2980,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock",
block + 1);
brelse(bh);
+ *recovery = 1;
continue;
}
@@ -2967,10 +2993,6 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
brelse(bh);
}
- /* Fail to read any one of the superblocks*/
- if (err < 0)
- *recovery = 1;
-
/* No valid superblock */
if (!*raw_super)
kvfree(super);
@@ -3324,6 +3346,8 @@ try_onemore:
sbi->write_io[i][j].bio = NULL;
spin_lock_init(&sbi->write_io[i][j].io_lock);
INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
+ INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
+ init_rwsem(&sbi->write_io[i][j].bio_list_lock);
}
}
@@ -3735,8 +3759,13 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_post_read_processing();
if (err)
goto free_root_stats;
+ err = f2fs_init_bio_entry_cache();
+ if (err)
+ goto free_post_read;
return 0;
+free_post_read:
+ f2fs_destroy_post_read_processing();
free_root_stats:
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
@@ -3760,6 +3789,7 @@ fail:
static void __exit exit_f2fs_fs(void)
{
+ f2fs_destroy_bio_entry_cache();
f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
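
The two hunks above slot the new bio entry cache into the usual module init/exit discipline: each init step gets a matching unwind label, failures roll back in reverse order, and module exit mirrors that same reverse order. A schematic of the pattern, with hypothetical step names:

/* schematic only: A_init()/B_init() are placeholders, not f2fs calls */
err = A_init();
if (err)
	goto out;
err = B_init();
if (err)
	goto free_A;
return 0;
free_A:
	A_destroy();
out:
	return err;
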
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index b558b64a4c9c..70945ceb9c0c 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
if (f2fs_sb_has_casefold(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "casefold");
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "pin_file");
len += snprintf(buf + len, PAGE_SIZE - len, "\n");
return len;
}
@@ -443,6 +445,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, main_blkaddr, main_blkaddr);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
@@ -510,6 +513,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_idle),
ATTR_LIST(gc_urgent),
ATTR_LIST(reclaim_segments),
+ ATTR_LIST(main_blkaddr),
ATTR_LIST(max_small_discards),
ATTR_LIST(discard_granularity),
ATTR_LIST(batched_trim_sections),
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 181900af2576..296b3189448a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -539,8 +539,9 @@ out:
ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct inode *inode = d_inode(dentry);
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
struct f2fs_xattr_entry *entry;
- void *base_addr;
+ void *base_addr, *last_base_addr;
int error = 0;
size_t rest = buffer_size;
@@ -550,6 +551,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (error)
return error;
+ last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
f2fs_xattr_handler(entry->e_name_index);
@@ -557,6 +560,15 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
size_t prefix_len;
size_t size;
+ if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
+ (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) {
+ f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ inode->i_ino);
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+ error = -EFSCORRUPTED;
+ goto cleanup;
+ }
+
if (!handler || (handler->list && !handler->list(dentry)))
continue;
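
The check added above refuses to dereference an entry unless both its fixed-size header and its computed end lie inside the region that was actually read. The same predicate as a standalone helper, for illustration (XATTR_NEXT_ENTRY is the existing f2fs macro):

/* hedged sketch of the bounds test above */
static inline bool f2fs_xattr_entry_in_bounds(struct f2fs_xattr_entry *entry,
					      void *last_base_addr)
{
	return (void *)entry + sizeof(__u32) <= last_base_addr &&
	       (void *)XATTR_NEXT_ENTRY(entry) <= last_base_addr;
}
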
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 4614c0ba5f1c..bdc4503c00a3 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -172,15 +172,6 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
}
-#ifdef CONFIG_COMPAT
-static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
- unsigned long arg)
-
-{
- return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
-}
-#endif
-
static int fat_file_release(struct inode *inode, struct file *filp)
{
if ((filp->f_mode & FMODE_WRITE) &&
@@ -215,9 +206,7 @@ const struct file_operations fat_file_operations = {
.mmap = generic_file_mmap,
.release = fat_file_release,
.unlocked_ioctl = fat_generic_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = fat_generic_compat_ioctl,
-#endif
+ .compat_ioctl = compat_ptr_ioctl,
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
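
compat_ptr_ioctl is the new generic helper used here: for drivers whose ioctls are compat-clean apart from the pointer representation, it simply rewrites the argument and forwards to ->unlocked_ioctl. Roughly:

/* hedged sketch of the helper's behaviour */
long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	if (!file->f_op->unlocked_ioctl)
		return -ENOIOCTLCMD;
	return file->f_op->unlocked_ioctl(file, cmd,
					  (unsigned long)compat_ptr(arg));
}

That makes the open-coded fat_generic_compat_ioctl wrapper removed above redundant.
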
diff --git a/fs/file.c b/fs/file.c
index b241ea7f1aa4..3da91a112bab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -795,7 +795,7 @@ unsigned long __fdget_pos(unsigned int fd)
unsigned long v = __fdget(fd);
struct file *file = (struct file *)(v & ~3);
- if (file && !(file->f_mode & FMODE_STREAM)) {
+ if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
if (file_count(file) > 1) {
v |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
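
The test flips from the opt-out "not a stream" to the opt-in FMODE_ATOMIC_POS, so only files that actually advertise an atomic position take f_pos_lock. For reference, the bit is set in do_dentry_open along these lines (sketch, not copied verbatim from fs/open.c):

if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
	f->f_mode |= FMODE_ATOMIC_POS;
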
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ed1abc9e33cf..d4e6691d2d92 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -705,7 +705,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
cs->pipebufs++;
cs->nr_segs--;
} else {
- if (cs->nr_segs == cs->pipe->buffers)
+ if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
page = alloc_page(GFP_HIGHUSER);
@@ -881,7 +881,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
struct pipe_buffer *buf;
int err;
- if (cs->nr_segs == cs->pipe->buffers)
+ if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
err = unlock_request(cs->req);
@@ -1343,7 +1343,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (!fud)
return -EPERM;
- bufs = kvmalloc_array(pipe->buffers, sizeof(struct pipe_buffer),
+ bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
GFP_KERNEL);
if (!bufs)
return -ENOMEM;
@@ -1355,7 +1355,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (ret < 0)
goto out;
- if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+ if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) {
ret = -EIO;
goto out;
}
@@ -1937,6 +1937,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
{
+ unsigned int head, tail, mask, count;
unsigned nbuf;
unsigned idx;
struct pipe_buffer *bufs;
@@ -1951,8 +1952,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe_lock(pipe);
- bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer),
- GFP_KERNEL);
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
+ count = head - tail;
+
+ bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
if (!bufs) {
pipe_unlock(pipe);
return -ENOMEM;
@@ -1960,8 +1965,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
nbuf = 0;
rem = 0;
- for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
- rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+ for (idx = tail; idx < head && rem < len; idx++)
+ rem += pipe->bufs[idx & mask].len;
ret = -EINVAL;
if (rem < len)
@@ -1972,16 +1977,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct pipe_buffer *ibuf;
struct pipe_buffer *obuf;
- BUG_ON(nbuf >= pipe->buffers);
- BUG_ON(!pipe->nrbufs);
- ibuf = &pipe->bufs[pipe->curbuf];
+ BUG_ON(nbuf >= pipe->ring_size);
+ BUG_ON(tail == head);
+ ibuf = &pipe->bufs[tail & mask];
obuf = &bufs[nbuf];
if (rem >= ibuf->len) {
*obuf = *ibuf;
ibuf->ops = NULL;
- pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
- pipe->nrbufs--;
+ tail++;
+ pipe->tail = tail;
} else {
if (!pipe_buf_get(pipe, ibuf))
goto out_free;
@@ -2262,7 +2267,7 @@ const struct file_operations fuse_dev_operations = {
.release = fuse_dev_release,
.fasync = fuse_dev_fasync,
.unlocked_ioctl = fuse_dev_ioctl,
- .compat_ioctl = fuse_dev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
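
The fuse changes above are mechanical fallout of the pipe ring rework: curbuf/nrbufs/buffers are replaced by free-running head and tail indices over a power-of-two ring. A hedged sketch of the new bookkeeping:

/* head and tail increase without wrapping; masking picks the slot,
 * and their difference stays well-defined across wraparound */
static inline unsigned int occupancy(unsigned int head, unsigned int tail)
{
	return head - tail;	/* what pipe_occupancy() computes */
}

static inline struct pipe_buffer *ring_slot(struct pipe_inode_info *pipe,
					    unsigned int idx)
{
	return &pipe->bufs[idx & (pipe->ring_size - 1)];
}

max_usage bounds how many slots a producer may fill, which is why the -EIO checks now compare against it rather than against the ring size.
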
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index f63df54a08c6..516103248272 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1149,7 +1149,8 @@ static inline bool gfs2_iomap_need_write_lock(unsigned flags)
}
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
- unsigned flags, struct iomap *iomap)
+ unsigned flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct metapath mp = { .mp_aheight = 1, };
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 997b326247e2..d07a295f9cac 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/compat.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/pagemap.h>
@@ -354,6 +355,31 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -ENOTTY;
}
+#ifdef CONFIG_COMPAT
+static long gfs2_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ /* These are just misnamed, they actually get/put from/to user an int */
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ /* Keep this list in sync with gfs2_ioctl */
+ case FITRIM:
+ case FS_IOC_GETFSLABEL:
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return gfs2_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#else
+#define gfs2_compat_ioctl NULL
+#endif
+
/**
* gfs2_size_hint - Give a hint to the size of a write request
* @filep: The struct file
@@ -732,7 +758,8 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
if (ret)
goto out_uninit;
- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL);
+ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
gfs2_glock_dq(&gh);
out_uninit:
@@ -767,7 +794,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (offset + len > i_size_read(&ip->i_inode))
goto out;
- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL);
+ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
out:
gfs2_glock_dq(&gh);
@@ -1293,6 +1321,7 @@ const struct file_operations gfs2_file_fops = {
.write_iter = gfs2_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
.release = gfs2_release,
@@ -1308,6 +1337,7 @@ const struct file_operations gfs2_file_fops = {
const struct file_operations gfs2_dir_fops = {
.iterate_shared = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
@@ -1324,6 +1354,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.write_iter = gfs2_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
.release = gfs2_release,
@@ -1337,6 +1368,7 @@ const struct file_operations gfs2_file_fops_nolock = {
const struct file_operations gfs2_dir_fops_nolock = {
.iterate_shared = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
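
iomap_dio_rw has grown a wait-for-completion flag, and gfs2 passes is_sync_kiocb(iocb) so synchronous callers still block while AIO is allowed to complete out of line. The call shape, for illustration:

/* hedged sketch of the updated call contract */
ret = iomap_dio_rw(iocb, iter, &gfs2_iomap_ops,
		   NULL /* no dio ops */, is_sync_kiocb(iocb));
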
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index d85230c84ef2..f32f15669996 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -325,4 +325,5 @@ const struct file_operations hpfs_dir_ops =
.release = hpfs_dir_release,
.fsync = hpfs_file_fsync,
.unlocked_ioctl = hpfs_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 1ecec124e76f..b36abf9cb345 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -215,6 +215,7 @@ const struct file_operations hpfs_file_ops =
.fsync = hpfs_file_fsync,
.splice_read = generic_file_splice_read,
.unlocked_ioctl = hpfs_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
const struct inode_operations hpfs_file_iops =
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a478df035651..d5c2a3158610 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash;
index = page->index;
- hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
addr = index * hpage_size;
/* mutex taken here, fault path and hole punch */
- hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
@@ -815,8 +815,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* File creation. Allocate an inode, and we're done..
*/
-static int hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t dev)
+static int do_hugetlbfs_mknod(struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t dev,
+ bool tmpfile)
{
struct inode *inode;
int error = -ENOSPC;
@@ -824,13 +827,23 @@ static int hugetlbfs_mknod(struct inode *dir,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (inode) {
dir->i_ctime = dir->i_mtime = current_time(dir);
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
+ if (tmpfile) {
+ d_tmpfile(dentry, inode);
+ } else {
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
+ }
error = 0;
}
return error;
}
+static int hugetlbfs_mknod(struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
+{
+ return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
+}
+
static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
@@ -844,6 +857,12 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mo
return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
+static int hugetlbfs_tmpfile(struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+{
+ return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
+}
+
static int hugetlbfs_symlink(struct inode *dir,
struct dentry *dentry, const char *symname)
{
@@ -1102,6 +1121,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations = {
.mknod = hugetlbfs_mknod,
.rename = simple_rename,
.setattr = hugetlbfs_setattr,
+ .tmpfile = hugetlbfs_tmpfile,
};
static const struct inode_operations hugetlbfs_inode_operations = {
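
With .tmpfile wired up, hugetlbfs now accepts O_TMPFILE opens; the new inode is instantiated via d_tmpfile() instead of being pinned in the dcache. A hypothetical userspace check (the mount point is an assumption; adjust to your setup):

/* sketch: O_TMPFILE on a hugetlbfs mount */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/hugepages", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}
	/* unnamed huge-page inode; freed on close unless linked in */
	close(fd);
	return 0;
}
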
@@ -1461,28 +1481,41 @@ static int __init init_hugetlbfs_fs(void)
sizeof(struct hugetlbfs_inode_info),
0, SLAB_ACCOUNT, init_once);
if (hugetlbfs_inode_cachep == NULL)
- goto out2;
+ goto out;
error = register_filesystem(&hugetlbfs_fs_type);
if (error)
- goto out;
+ goto out_free;
+ /* default hstate mount is required */
+ mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
+ if (IS_ERR(mnt)) {
+ error = PTR_ERR(mnt);
+ goto out_unreg;
+ }
+ hugetlbfs_vfsmount[default_hstate_idx] = mnt;
+
+ /* other hstates are optional */
i = 0;
for_each_hstate(h) {
+ if (i == default_hstate_idx)
+ continue;
+
mnt = mount_one_hugetlbfs(h);
- if (IS_ERR(mnt) && i == 0) {
- error = PTR_ERR(mnt);
- goto out;
- }
- hugetlbfs_vfsmount[i] = mnt;
+ if (IS_ERR(mnt))
+ hugetlbfs_vfsmount[i] = NULL;
+ else
+ hugetlbfs_vfsmount[i] = mnt;
i++;
}
return 0;
- out:
+ out_unreg:
+ (void)unregister_filesystem(&hugetlbfs_fs_type);
+ out_free:
kmem_cache_destroy(hugetlbfs_inode_cachep);
- out2:
+ out:
return error;
}
fs_initcall(init_hugetlbfs_fs)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 9174007ce107..91b85df0861e 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -33,6 +33,7 @@ enum {
enum {
IO_WQ_BIT_EXIT = 0, /* wq exiting */
IO_WQ_BIT_CANCEL = 1, /* cancel work on list */
+ IO_WQ_BIT_ERROR = 2, /* error on setup */
};
enum {
@@ -56,6 +57,7 @@ struct io_worker {
struct rcu_head rcu;
struct mm_struct *mm;
+ const struct cred *creds;
struct files_struct *restore_files;
};
@@ -82,7 +84,7 @@ enum {
struct io_wqe {
struct {
spinlock_t lock;
- struct list_head work_list;
+ struct io_wq_work_list work_list;
unsigned long hash_map;
unsigned flags;
} ____cacheline_aligned_in_smp;
@@ -103,13 +105,13 @@ struct io_wqe {
struct io_wq {
struct io_wqe **wqes;
unsigned long state;
- unsigned nr_wqes;
get_work_fn *get_work;
put_work_fn *put_work;
struct task_struct *manager;
struct user_struct *user;
+ struct cred *creds;
struct mm_struct *mm;
refcount_t refs;
struct completion done;
@@ -135,6 +137,11 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
{
bool dropped_lock = false;
+ if (worker->creds) {
+ revert_creds(worker->creds);
+ worker->creds = NULL;
+ }
+
if (current->files != worker->restore_files) {
__acquire(&wqe->lock);
spin_unlock_irq(&wqe->lock);
@@ -229,7 +236,8 @@ static void io_worker_exit(struct io_worker *worker)
static inline bool io_wqe_run_queue(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
- if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED))
+ if (!wq_list_empty(&wqe->work_list) &&
+ !(wqe->flags & IO_WQE_FLAG_STALLED))
return true;
return false;
}
@@ -327,9 +335,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
* If worker is moving from bound to unbound (or vice versa), then
* ensure we update the running accounting.
*/
- worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
- work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
- if (worker_bound != work_bound) {
+ worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
+ work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
+ if (worker_bound != work_bound) {
io_wqe_dec_running(wqe, worker);
if (work_bound) {
worker->flags |= IO_WORKER_F_BOUND;
@@ -368,12 +376,15 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
__must_hold(wqe->lock)
{
+ struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
- list_for_each_entry(work, &wqe->work_list, list) {
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+
/* not hashed, can run anytime */
if (!(work->flags & IO_WQ_WORK_HASHED)) {
- list_del(&work->list);
+ wq_node_del(&wqe->work_list, node, prev);
return work;
}
@@ -381,7 +392,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
*hash = work->flags >> IO_WQ_HASH_SHIFT;
if (!(wqe->hash_map & BIT_ULL(*hash))) {
wqe->hash_map |= BIT_ULL(*hash);
- list_del(&work->list);
+ wq_node_del(&wqe->work_list, node, prev);
return work;
}
}
@@ -409,7 +420,7 @@ static void io_worker_handle_work(struct io_worker *worker)
work = io_get_next_work(wqe, &hash);
if (work)
__io_worker_busy(wqe, worker, work);
- else if (!list_empty(&wqe->work_list))
+ else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
spin_unlock_irq(&wqe->lock);
@@ -426,6 +437,9 @@ next:
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
+ if (work->flags & IO_WQ_WORK_CB)
+ work->func(&work);
+
if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
current->files != work->files) {
task_lock(current);
@@ -438,6 +452,8 @@ next:
set_fs(USER_DS);
worker->mm = wq->mm;
}
+ if (!worker->creds)
+ worker->creds = override_creds(wq->creds);
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
@@ -514,7 +530,7 @@ static int io_wqe_worker(void *data)
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
spin_lock_irq(&wqe->lock);
- if (!list_empty(&wqe->work_list))
+ if (!wq_list_empty(&wqe->work_list))
io_worker_handle_work(worker);
else
spin_unlock_irq(&wqe->lock);
@@ -562,14 +578,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
spin_unlock_irq(&wqe->lock);
}
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
struct io_wqe_acct *acct =&wqe->acct[index];
struct io_worker *worker;
- worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node);
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
if (!worker)
- return;
+ return false;
refcount_set(&worker->ref, 1);
worker->nulls_node.pprev = NULL;
@@ -581,7 +597,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
"io_wqe_worker-%d/%d", index, wqe->node);
if (IS_ERR(worker->task)) {
kfree(worker);
- return;
+ return false;
}
spin_lock_irq(&wqe->lock);
@@ -599,6 +615,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
atomic_inc(&wq->user->processes);
wake_up_process(worker->task);
+ return true;
}
static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
@@ -606,9 +623,6 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
{
struct io_wqe_acct *acct = &wqe->acct[index];
- /* always ensure we have one bounded worker */
- if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers)
- return true;
/* if we have available workers or no work, no need */
if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
return false;
@@ -621,12 +635,22 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
static int io_wq_manager(void *data)
{
struct io_wq *wq = data;
+ int workers_to_create = num_possible_nodes();
+ int node;
- while (!kthread_should_stop()) {
- int i;
+ /* create fixed workers */
+ refcount_set(&wq->refs, workers_to_create);
+ for_each_node(node) {
+ if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
+ goto err;
+ workers_to_create--;
+ }
+
+ complete(&wq->done);
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ while (!kthread_should_stop()) {
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
bool fork_worker[2] = { false, false };
spin_lock_irq(&wqe->lock);
@@ -645,6 +669,12 @@ static int io_wq_manager(void *data)
}
return 0;
+err:
+ set_bit(IO_WQ_BIT_ERROR, &wq->state);
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ if (refcount_sub_and_test(workers_to_create, &wq->refs))
+ complete(&wq->done);
+ return 0;
}
static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
@@ -688,7 +718,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
}
spin_lock_irqsave(&wqe->lock, flags);
- list_add_tail(&work->list, &wqe->work_list);
+ wq_list_add_tail(&work->list, &wqe->work_list);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
@@ -750,7 +780,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
void io_wq_cancel_all(struct io_wq *wq)
{
- int i;
+ int node;
set_bit(IO_WQ_BIT_CANCEL, &wq->state);
@@ -759,8 +789,8 @@ void io_wq_cancel_all(struct io_wq *wq)
* to a worker and the worker putting itself on the busy_list
*/
rcu_read_lock();
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
}
@@ -803,14 +833,17 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
.cancel = cancel,
.caller_data = cancel_data,
};
+ struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
bool found = false;
spin_lock_irqsave(&wqe->lock, flags);
- list_for_each_entry(work, &wqe->work_list, list) {
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+
if (cancel(work, cancel_data)) {
- list_del(&work->list);
+ wq_node_del(&wqe->work_list, node, prev);
found = true;
break;
}
@@ -833,10 +866,10 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data)
{
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
- int i;
+ int node;
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
ret = io_wqe_cancel_cb_work(wqe, cancel, data);
if (ret != IO_WQ_CANCEL_NOTFOUND)
@@ -868,6 +901,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
struct io_wq_work *cwork)
{
+ struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
bool found = false;
@@ -880,9 +914,11 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
* no completion will be posted for it.
*/
spin_lock_irqsave(&wqe->lock, flags);
- list_for_each_entry(work, &wqe->work_list, list) {
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+
if (work == cwork) {
- list_del(&work->list);
+ wq_node_del(&wqe->work_list, node, prev);
found = true;
break;
}
@@ -910,10 +946,10 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
- int i;
+ int node;
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
ret = io_wqe_cancel_work(wqe, cwork);
if (ret != IO_WQ_CANCEL_NOTFOUND)
@@ -944,10 +980,10 @@ static void io_wq_flush_func(struct io_wq_work **workptr)
void io_wq_flush(struct io_wq *wq)
{
struct io_wq_flush_data data;
- int i;
+ int node;
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
init_completion(&data.done);
INIT_IO_WORK(&data.work, io_wq_flush_func);
@@ -957,43 +993,39 @@ void io_wq_flush(struct io_wq *wq)
}
}
-struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
- struct user_struct *user, get_work_fn *get_work,
- put_work_fn *put_work)
+struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
- int ret = -ENOMEM, i, node;
+ int ret = -ENOMEM, node;
struct io_wq *wq;
- wq = kcalloc(1, sizeof(*wq), GFP_KERNEL);
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
- wq->nr_wqes = num_online_nodes();
- wq->wqes = kcalloc(wq->nr_wqes, sizeof(struct io_wqe *), GFP_KERNEL);
+ wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
if (!wq->wqes) {
kfree(wq);
return ERR_PTR(-ENOMEM);
}
- wq->get_work = get_work;
- wq->put_work = put_work;
+ wq->get_work = data->get_work;
+ wq->put_work = data->put_work;
/* caller must already hold a reference to this */
- wq->user = user;
+ wq->user = data->user;
+ wq->creds = data->creds;
- i = 0;
- refcount_set(&wq->refs, wq->nr_wqes);
- for_each_online_node(node) {
+ for_each_node(node) {
struct io_wqe *wqe;
- wqe = kcalloc_node(1, sizeof(struct io_wqe), GFP_KERNEL, node);
+ wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
if (!wqe)
- break;
- wq->wqes[i] = wqe;
+ goto err;
+ wq->wqes[node] = wqe;
wqe->node = node;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
- if (user) {
+ if (wq->user) {
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
}
@@ -1001,33 +1033,36 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
wqe->node = node;
wqe->wq = wq;
spin_lock_init(&wqe->lock);
- INIT_LIST_HEAD(&wqe->work_list);
+ INIT_WQ_LIST(&wqe->work_list);
INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1);
INIT_LIST_HEAD(&wqe->all_list);
-
- i++;
}
init_completion(&wq->done);
- if (i != wq->nr_wqes)
- goto err;
-
/* caller must have already done mmgrab() on this mm */
- wq->mm = mm;
+ wq->mm = data->mm;
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
if (!IS_ERR(wq->manager)) {
wake_up_process(wq->manager);
+ wait_for_completion(&wq->done);
+ if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ reinit_completion(&wq->done);
return wq;
}
ret = PTR_ERR(wq->manager);
- wq->manager = NULL;
-err:
complete(&wq->done);
- io_wq_destroy(wq);
+err:
+ for_each_node(node)
+ kfree(wq->wqes[node]);
+ kfree(wq->wqes);
+ kfree(wq);
return ERR_PTR(ret);
}
@@ -1039,27 +1074,21 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
void io_wq_destroy(struct io_wq *wq)
{
- int i;
+ int node;
- if (wq->manager) {
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ if (wq->manager)
kthread_stop(wq->manager);
- }
rcu_read_lock();
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
-
- if (!wqe)
- continue;
- io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
- }
+ for_each_node(node)
+ io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
rcu_read_unlock();
wait_for_completion(&wq->done);
- for (i = 0; i < wq->nr_wqes; i++)
- kfree(wq->wqes[i]);
+ for_each_node(node)
+ kfree(wq->wqes[node]);
kfree(wq->wqes);
kfree(wq);
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 4b29f922f80c..600e0158cba7 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -11,6 +11,7 @@ enum {
IO_WQ_WORK_NEEDS_FILES = 16,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_INTERNAL = 64,
+ IO_WQ_WORK_CB = 128,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -21,15 +22,60 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
+struct io_wq_work_node {
+ struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+ struct io_wq_work_node *first;
+ struct io_wq_work_node *last;
+};
+
+static inline void wq_list_add_tail(struct io_wq_work_node *node,
+ struct io_wq_work_list *list)
+{
+ if (!list->first) {
+ list->first = list->last = node;
+ } else {
+ list->last->next = node;
+ list->last = node;
+ }
+}
+
+static inline void wq_node_del(struct io_wq_work_list *list,
+ struct io_wq_work_node *node,
+ struct io_wq_work_node *prev)
+{
+ if (node == list->first)
+ list->first = node->next;
+ if (node == list->last)
+ list->last = prev;
+ if (prev)
+ prev->next = node->next;
+}
+
+#define wq_list_for_each(pos, prv, head) \
+ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list) ((list)->first == NULL)
+#define INIT_WQ_LIST(list) do { \
+ (list)->first = NULL; \
+ (list)->last = NULL; \
+} while (0)
+
struct io_wq_work {
- struct list_head list;
+ union {
+ struct io_wq_work_node list;
+ void *data;
+ };
void (*func)(struct io_wq_work **);
- unsigned flags;
struct files_struct *files;
+ unsigned flags;
};
#define INIT_IO_WORK(work, _func) \
do { \
+ (work)->list.next = NULL; \
(work)->func = _func; \
(work)->flags = 0; \
(work)->files = NULL; \
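
The singly linked io_wq_work_list trades list_head's O(1) arbitrary removal for a smaller node; removal needs the predecessor, which wq_list_for_each tracks for exactly that reason. A hedged usage sketch built on the helpers above (locking elided):

static void remove_work(struct io_wq_work_list *list,
			struct io_wq_work *victim)
{
	struct io_wq_work_node *node, *prev;

	wq_list_for_each(node, prev, list) {
		struct io_wq_work *work =
			container_of(node, struct io_wq_work, list);

		if (work == victim) {
			wq_node_del(list, node, prev);	/* O(1): prev is known */
			break;
		}
	}
}
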
@@ -38,9 +84,16 @@ struct io_wq_work {
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
-struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
- struct user_struct *user,
- get_work_fn *get_work, put_work_fn *put_work);
+struct io_wq_data {
+ struct mm_struct *mm;
+ struct user_struct *user;
+ struct cred *creds;
+
+ get_work_fn *get_work;
+ put_work_fn *put_work;
+};
+
+struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
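
Creation-time parameters now travel in io_wq_data rather than a growing positional argument list. A hedged caller sketch (the callback names are hypothetical):

struct io_wq_data data = {
	.mm       = current->mm,	/* caller must already hold a grab */
	.user     = ctx_user,		/* caller-held reference, per the comment */
	.creds    = ctx_creds,
	.get_work = my_get_work,	/* hypothetical callbacks */
	.put_work = my_put_work,
};
struct io_wq *wq = io_wq_create(bounded, &data);

if (IS_ERR(wq))
	return PTR_ERR(wq);
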
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4c030a92de79..ec53aa7cdc94 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -69,6 +69,7 @@
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
+#include <linux/highmem.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -186,6 +187,7 @@ struct io_ring_ctx {
bool compat;
bool account_mem;
bool cq_overflow_flushed;
+ bool drain_next;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -236,6 +238,8 @@ struct io_ring_ctx {
struct user_struct *user;
+ struct cred *creds;
+
/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
struct completion *completions;
@@ -278,16 +282,6 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
};
-struct sqe_submit {
- const struct io_uring_sqe *sqe;
- struct file *ring_file;
- int ring_fd;
- u32 sequence;
- bool has_user;
- bool in_async;
- bool needs_fixed_file;
-};
-
/*
* First field must be the file pointer in all the
* iocb unions! See also 'struct kiocb' in <linux/fs.h>
@@ -298,12 +292,20 @@ struct io_poll_iocb {
__poll_t events;
bool done;
bool canceled;
- struct wait_queue_entry wait;
+ struct wait_queue_entry *wait;
+};
+
+struct io_timeout_data {
+ struct io_kiocb *req;
+ struct hrtimer timer;
+ struct timespec64 ts;
+ enum hrtimer_mode mode;
+ u32 seq_offset;
};
struct io_timeout {
struct file *file;
- struct hrtimer timer;
+ struct io_timeout_data *data;
};
/*
@@ -320,7 +322,12 @@ struct io_kiocb {
struct io_timeout timeout;
};
- struct sqe_submit submit;
+ const struct io_uring_sqe *sqe;
+ struct file *ring_file;
+ int ring_fd;
+ bool has_user;
+ bool in_async;
+ bool needs_fixed_file;
struct io_ring_ctx *ctx;
union {
@@ -333,19 +340,20 @@ struct io_kiocb {
#define REQ_F_NOWAIT 1 /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
-#define REQ_F_SEQ_PREV 8 /* sequential with previous */
+#define REQ_F_LINK_NEXT 8 /* already grabbed next link */
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
#define REQ_F_IO_DRAINED 32 /* drain done */
#define REQ_F_LINK 64 /* linked sqes */
#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
-#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */
+#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */
#define REQ_F_TIMEOUT 1024 /* timeout request */
#define REQ_F_ISREG 2048 /* regular file */
#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
+#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */
u64 user_data;
u32 result;
u32 sequence;
@@ -383,6 +391,9 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void __io_free_req(struct io_kiocb *req);
static void io_put_req(struct io_kiocb *req);
static void io_double_put_req(struct io_kiocb *req);
+static void __io_double_put_req(struct io_kiocb *req);
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
+static void io_queue_linked_timeout(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -521,12 +532,13 @@ static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
opcode == IORING_OP_WRITE_FIXED);
}
-static inline bool io_prep_async_work(struct io_kiocb *req)
+static inline bool io_prep_async_work(struct io_kiocb *req,
+ struct io_kiocb **link)
{
bool do_hashed = false;
- if (req->submit.sqe) {
- switch (req->submit.sqe->opcode) {
+ if (req->sqe) {
+ switch (req->sqe->opcode) {
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
do_hashed = true;
@@ -537,6 +549,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
case IORING_OP_RECVMSG:
case IORING_OP_ACCEPT:
case IORING_OP_POLL_ADD:
+ case IORING_OP_CONNECT:
/*
* We know REQ_F_ISREG is not set on some of these
* opcodes, but this enables us to keep the check in
@@ -546,17 +559,21 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
req->work.flags |= IO_WQ_WORK_UNBOUND;
break;
}
- if (io_sqe_needs_user(req->submit.sqe))
+ if (io_sqe_needs_user(req->sqe))
req->work.flags |= IO_WQ_WORK_NEEDS_USER;
}
+ *link = io_prep_linked_timeout(req);
return do_hashed;
}
static inline void io_queue_async_work(struct io_kiocb *req)
{
- bool do_hashed = io_prep_async_work(req);
struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *link;
+ bool do_hashed;
+
+ do_hashed = io_prep_async_work(req, &link);
trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
req->flags);
@@ -566,13 +583,16 @@ static inline void io_queue_async_work(struct io_kiocb *req)
io_wq_enqueue_hashed(ctx->io_wq, &req->work,
file_inode(req->file));
}
+
+ if (link)
+ io_queue_linked_timeout(link);
}
static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
list_del_init(&req->list);
@@ -601,11 +621,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
__io_commit_cqring(ctx);
while ((req = io_get_deferred_req(ctx)) != NULL) {
- if (req->flags & REQ_F_SHADOW_DRAIN) {
- /* Just for drain, free it. */
- __io_free_req(req);
- continue;
- }
req->flags |= REQ_F_IO_DRAINED;
io_queue_async_work(req);
}
@@ -639,7 +654,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
eventfd_signal(ctx->cq_ev_fd, 1);
}
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+/* Returns true if there are no backlogged entries after the flush */
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
struct io_rings *rings = ctx->rings;
struct io_uring_cqe *cqe;
@@ -649,10 +665,10 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (!force) {
if (list_empty_careful(&ctx->cq_overflow_list))
- return;
+ return true;
if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
rings->cq_ring_entries))
- return;
+ return false;
}
spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -661,6 +677,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (force)
ctx->cq_overflow_flushed = true;
+ cqe = NULL;
while (!list_empty(&ctx->cq_overflow_list)) {
cqe = io_get_cqring(ctx);
if (!cqe && !force)
@@ -688,6 +705,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
list_del(&req->list);
io_put_req(req);
}
+
+ return cqe != NULL;
}
static void io_cqring_fill_event(struct io_kiocb *req, long res)
@@ -787,6 +806,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
}
got_it:
+ req->ring_file = NULL;
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
@@ -816,6 +836,8 @@ static void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ if (req->flags & REQ_F_FREE_SQE)
+ kfree(req->sqe);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
if (req->flags & REQ_F_INFLIGHT) {
@@ -827,6 +849,8 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
+ if (req->flags & REQ_F_TIMEOUT)
+ kfree(req->timeout.data);
percpu_ref_put(&ctx->refs);
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
@@ -839,7 +863,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
@@ -857,6 +881,10 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
struct io_kiocb *nxt;
bool wake_ev = false;
+ /* Already got next link */
+ if (req->flags & REQ_F_LINK_NEXT)
+ return;
+
/*
* The list should never be empty when we are called here. But could
* potentially happen if the chain is messed up, check to be on the
@@ -865,31 +893,26 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
while (nxt) {
list_del_init(&nxt->list);
+
+ if ((req->flags & REQ_F_LINK_TIMEOUT) &&
+ (nxt->flags & REQ_F_TIMEOUT)) {
+ wake_ev |= io_link_cancel_timeout(nxt);
+ nxt = list_first_entry_or_null(&req->link_list,
+ struct io_kiocb, list);
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+ continue;
+ }
if (!list_empty(&req->link_list)) {
INIT_LIST_HEAD(&nxt->link_list);
list_splice(&req->link_list, &nxt->link_list);
nxt->flags |= REQ_F_LINK;
}
- /*
- * If we're in async work, we can continue processing the chain
- * in this context instead of having to queue up new async work.
- */
- if (req->flags & REQ_F_LINK_TIMEOUT) {
- wake_ev = io_link_cancel_timeout(nxt);
-
- /* we dropped this link, get next */
- nxt = list_first_entry_or_null(&req->link_list,
- struct io_kiocb, list);
- } else if (nxtptr && io_wq_current_is_worker()) {
- *nxtptr = nxt;
- break;
- } else {
- io_queue_async_work(nxt);
- break;
- }
+ *nxtptr = nxt;
+ break;
}
+ req->flags |= REQ_F_LINK_NEXT;
if (wake_ev)
io_cqring_ev_posted(ctx);
}
@@ -912,12 +935,13 @@ static void io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
if ((req->flags & REQ_F_LINK_TIMEOUT) &&
- link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
+ link->sqe->opcode == IORING_OP_LINK_TIMEOUT) {
io_link_cancel_timeout(link);
} else {
io_cqring_fill_event(link, -ECANCELED);
- io_double_put_req(link);
+ __io_double_put_req(link);
}
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
}
io_commit_cqring(ctx);
@@ -925,12 +949,10 @@ static void io_fail_links(struct io_kiocb *req)
io_cqring_ev_posted(ctx);
}
-static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
{
- if (likely(!(req->flags & REQ_F_LINK))) {
- __io_free_req(req);
+ if (likely(!(req->flags & REQ_F_LINK)))
return;
- }
/*
* If LINK is set, we have dependent requests in this chain. If we
@@ -956,32 +978,30 @@ static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
} else {
io_req_link_next(req, nxt);
}
-
- __io_free_req(req);
}
static void io_free_req(struct io_kiocb *req)
{
- io_free_req_find_next(req, NULL);
+ struct io_kiocb *nxt = NULL;
+
+ io_req_find_next(req, &nxt);
+ __io_free_req(req);
+
+ if (nxt)
+ io_queue_async_work(nxt);
}
/*
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
*/
+__attribute__((nonnull))
static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
- struct io_kiocb *nxt = NULL;
+ io_req_find_next(req, nxtptr);
if (refcount_dec_and_test(&req->refs))
- io_free_req_find_next(req, &nxt);
-
- if (nxt) {
- if (nxtptr)
- *nxtptr = nxt;
- else
- io_queue_async_work(nxt);
- }
+ __io_free_req(req);
}
static void io_put_req(struct io_kiocb *req)
@@ -990,13 +1010,24 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static void io_double_put_req(struct io_kiocb *req)
+/*
+ * Must only be used if we don't need to care about links, usually from
+ * within the completion handling itself.
+ */
+static void __io_double_put_req(struct io_kiocb *req)
{
/* drop both submit and complete references */
if (refcount_sub_and_test(2, &req->refs))
__io_free_req(req);
}
+static void io_double_put_req(struct io_kiocb *req)
+{
+ /* drop both submit and complete references */
+ if (refcount_sub_and_test(2, &req->refs))
+ io_free_req(req);
+}
+
static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
{
struct io_rings *rings = ctx->rings;
@@ -1048,7 +1079,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
* completions for those, only batch free for fixed
* file and non-linked commands.
*/
- if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+ if (((req->flags &
+ (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) ==
REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs))
@@ -1366,7 +1398,7 @@ static bool io_file_supports_async(struct file *file)
static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
{
- const struct io_uring_sqe *sqe = req->submit.sqe;
+ const struct io_uring_sqe *sqe = req->sqe;
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw;
unsigned ioprio;
@@ -1453,15 +1485,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
bool in_async)
{
- if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw)
+ if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
*nxt = __io_complete_rw(kiocb, ret);
else
io_rw_done(kiocb, ret);
}
-static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
- const struct io_uring_sqe *sqe,
- struct iov_iter *iter)
+static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
+ const struct io_uring_sqe *sqe,
+ struct iov_iter *iter)
{
size_t len = READ_ONCE(sqe->len);
struct io_mapped_ubuf *imu;
@@ -1533,11 +1565,10 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
return len;
}
-static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
- const struct sqe_submit *s, struct iovec **iovec,
- struct iov_iter *iter)
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter)
{
- const struct io_uring_sqe *sqe = s->sqe;
+ const struct io_uring_sqe *sqe = req->sqe;
void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
size_t sqe_len = READ_ONCE(sqe->len);
u8 opcode;
@@ -1551,18 +1582,16 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
* flag.
*/
opcode = READ_ONCE(sqe->opcode);
- if (opcode == IORING_OP_READ_FIXED ||
- opcode == IORING_OP_WRITE_FIXED) {
- ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+ if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
*iovec = NULL;
- return ret;
+ return io_import_fixed(req->ctx, rw, sqe, iter);
}
- if (!s->has_user)
+ if (!req->has_user)
return -EFAULT;
#ifdef CONFIG_COMPAT
- if (ctx->compat)
+ if (req->ctx->compat)
return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
iovec, iter);
#endif
@@ -1590,9 +1619,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return -EAGAIN;
while (iov_iter_count(iter)) {
- struct iovec iovec = iov_iter_iovec(iter);
+ struct iovec iovec;
ssize_t nr;
+ if (!iov_iter_is_bvec(iter)) {
+ iovec = iov_iter_iovec(iter);
+ } else {
+ /* fixed buffers import bvec */
+ iovec.iov_base = kmap(iter->bvec->bv_page)
+ + iter->iov_offset;
+ iovec.iov_len = min(iter->count,
+ iter->bvec->bv_len - iter->iov_offset);
+ }
+
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
iovec.iov_len, &kiocb->ki_pos);
@@ -1601,6 +1640,9 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
iovec.iov_len, &kiocb->ki_pos);
}
+ if (iov_iter_is_bvec(iter))
+ kunmap(iter->bvec->bv_page);
+
if (nr < 0) {
if (!ret)
ret = nr;
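
Fixed buffers reach loop_rw_iter() as a bio_vec iterator, so the loop above bridges each segment into a plain iovec for the non-iter ->read/->write path: kmap() supplies a kernel mapping even for highmem pages and must be paired with kunmap() after the call. Schematically:

/* hedged sketch of the per-segment bridging above */
iovec.iov_base = kmap(iter->bvec->bv_page) + iter->iov_offset;
iovec.iov_len  = min(iter->count,
		     iter->bvec->bv_len - iter->iov_offset);
/* ... issue file->f_op->read()/write() on iovec ... */
kunmap(iter->bvec->bv_page);
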
@@ -1633,7 +1675,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
if (unlikely(!(file->f_mode & FMODE_READ)))
return -EBADF;
- ret = io_import_iovec(req->ctx, READ, &req->submit, &iovec, &iter);
+ ret = io_import_iovec(READ, req, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1665,7 +1707,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
ret2 = -EAGAIN;
/* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN)
- kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
+ kiocb_done(kiocb, ret2, nxt, req->in_async);
else
ret = -EAGAIN;
}
@@ -1691,7 +1733,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
if (unlikely(!(file->f_mode & FMODE_WRITE)))
return -EBADF;
- ret = io_import_iovec(req->ctx, WRITE, &req->submit, &iovec, &iter);
+ ret = io_import_iovec(WRITE, req, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1728,7 +1770,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
else
ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
if (!force_nonblock || ret2 != -EAGAIN)
- kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
+ kiocb_done(kiocb, ret2, nxt, req->in_async);
else
ret = -EAGAIN;
}
@@ -1918,7 +1960,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ if (sqe->ioprio || sqe->len || sqe->buf_index)
return -EINVAL;
addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
@@ -1943,6 +1985,38 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
#endif
}
+static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct sockaddr __user *addr;
+ unsigned file_flags;
+ int addr_len, ret;
+
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
+ return -EINVAL;
+
+ addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
+ addr_len = READ_ONCE(sqe->addr2);
+ file_flags = force_nonblock ? O_NONBLOCK : 0;
+
+ ret = __sys_connect_file(req->file, addr, addr_len, file_flags);
+ if (ret == -EAGAIN && force_nonblock)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
static inline void io_poll_remove_req(struct io_kiocb *req)
{
if (!RB_EMPTY_NODE(&req->rb_node)) {
@@ -1957,8 +2031,8 @@ static void io_poll_remove_one(struct io_kiocb *req)
spin_lock(&poll->head->lock);
WRITE_ONCE(poll->canceled, true);
- if (!list_empty(&poll->wait.entry)) {
- list_del_init(&poll->wait.entry);
+ if (!list_empty(&poll->wait->entry)) {
+ list_del_init(&poll->wait->entry);
io_queue_async_work(req);
}
spin_unlock(&poll->head->lock);
@@ -2026,12 +2100,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
req->poll.done = true;
- io_cqring_fill_event(req, mangle_poll(mask));
+ kfree(req->poll.wait);
+ if (error)
+ io_cqring_fill_event(req, error);
+ else
+ io_cqring_fill_event(req, mangle_poll(mask));
io_commit_cqring(ctx);
}
@@ -2044,11 +2122,16 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt = NULL;
__poll_t mask = 0;
+ int ret = 0;
- if (work->flags & IO_WQ_WORK_CANCEL)
+ if (work->flags & IO_WQ_WORK_CANCEL) {
WRITE_ONCE(poll->canceled, true);
+ ret = -ECANCELED;
+ } else if (READ_ONCE(poll->canceled)) {
+ ret = -ECANCELED;
+ }
- if (!READ_ONCE(poll->canceled))
+ if (ret != -ECANCELED)
mask = vfs_poll(poll->file, &pt) & poll->events;
/*
@@ -2059,17 +2142,19 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->completion_lock);
- if (!mask && !READ_ONCE(poll->canceled)) {
- add_wait_queue(poll->head, &poll->wait);
+ if (!mask && ret != -ECANCELED) {
+ add_wait_queue(poll->head, poll->wait);
spin_unlock_irq(&ctx->completion_lock);
return;
}
io_poll_remove_req(req);
- io_poll_complete(req, mask);
+ io_poll_complete(req, mask, ret);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
+ if (ret < 0 && req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
io_put_req_find_next(req, &nxt);
if (nxt)
*workptr = &nxt->work;
@@ -2078,8 +2163,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
- struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
- wait);
+ struct io_poll_iocb *poll = wait->private;
struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
struct io_ring_ctx *ctx = req->ctx;
__poll_t mask = key_to_poll(key);
@@ -2089,7 +2173,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & poll->events))
return 0;
- list_del_init(&poll->wait.entry);
+ list_del_init(&poll->wait->entry);
/*
* Run completion inline if we can. We're using trylock here because
@@ -2099,7 +2183,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
*/
if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
io_poll_remove_req(req);
- io_poll_complete(req, mask);
+ io_poll_complete(req, mask, 0);
req->flags |= REQ_F_COMP_LOCKED;
io_put_req(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -2130,7 +2214,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
pt->error = 0;
pt->req->poll.head = head;
- add_wait_queue(head, &pt->req->poll.wait);
+ add_wait_queue(head, pt->req->poll.wait);
}
static void io_poll_req_insert(struct io_kiocb *req)
@@ -2169,7 +2253,11 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (!poll->file)
return -EBADF;
- req->submit.sqe = NULL;
+ poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL);
+ if (!poll->wait)
+ return -ENOMEM;
+
+ req->sqe = NULL;
INIT_IO_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
@@ -2185,8 +2273,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+ INIT_LIST_HEAD(&poll->wait->entry);
+ init_waitqueue_func_entry(poll->wait, io_poll_wake);
+ poll->wait->private = poll;
INIT_LIST_HEAD(&req->list);
@@ -2195,14 +2284,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
spin_lock_irq(&ctx->completion_lock);
if (likely(poll->head)) {
spin_lock(&poll->head->lock);
- if (unlikely(list_empty(&poll->wait.entry))) {
+ if (unlikely(list_empty(&poll->wait->entry))) {
if (ipt.error)
cancel = true;
ipt.error = 0;
mask = 0;
}
if (mask || ipt.error)
- list_del_init(&poll->wait.entry);
+ list_del_init(&poll->wait->entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
else if (!poll->done) /* actually waiting for an event */
@@ -2211,7 +2300,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- io_poll_complete(req, mask);
+ io_poll_complete(req, mask, 0);
}
spin_unlock_irq(&ctx->completion_lock);
@@ -2224,12 +2313,12 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
- struct io_ring_ctx *ctx;
- struct io_kiocb *req;
+ struct io_timeout_data *data = container_of(timer,
+ struct io_timeout_data, timer);
+ struct io_kiocb *req = data->req;
+ struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
- req = container_of(timer, struct io_kiocb, timeout.timer);
- ctx = req->ctx;
atomic_inc(&ctx->cq_timeouts);
spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -2279,10 +2368,12 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (ret == -ENOENT)
return ret;
- ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
if (ret == -1)
return -EALREADY;
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
io_cqring_fill_event(req, -ECANCELED);
io_put_req(req);
return 0;
@@ -2319,34 +2410,54 @@ static int io_timeout_remove(struct io_kiocb *req,
return 0;
}
-static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_timeout_setup(struct io_kiocb *req)
{
- unsigned count;
- struct io_ring_ctx *ctx = req->ctx;
- struct list_head *entry;
- enum hrtimer_mode mode;
- struct timespec64 ts;
- unsigned span = 0;
+ const struct io_uring_sqe *sqe = req->sqe;
+ struct io_timeout_data *data;
unsigned flags;
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1)
+ if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~IORING_TIMEOUT_ABS)
return -EINVAL;
- if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
+ data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->req = req;
+ req->timeout.data = data;
+ req->flags |= REQ_F_TIMEOUT;
+
+ if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
if (flags & IORING_TIMEOUT_ABS)
- mode = HRTIMER_MODE_ABS;
+ data->mode = HRTIMER_MODE_ABS;
else
- mode = HRTIMER_MODE_REL;
+ data->mode = HRTIMER_MODE_REL;
- hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode);
- req->flags |= REQ_F_TIMEOUT;
+ hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+ return 0;
+}
+
+static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ unsigned count;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_timeout_data *data;
+ struct list_head *entry;
+ unsigned span = 0;
+ int ret;
+
+ ret = io_timeout_setup(req);
+	/* common setup allows flags (like links) to be set; we don't */
+ if (!ret && sqe->flags)
+ ret = -EINVAL;
+ if (ret)
+ return ret;
/*
* sqe->off holds how many events that need to occur for this
@@ -2362,8 +2473,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
req->sequence = ctx->cached_sq_head + count - 1;
- /* reuse it to store the count */
- req->submit.sequence = count;
+ req->timeout.data->seq_offset = count;
/*
* Insertion sort, ensuring the first entry in the list is always
@@ -2374,6 +2484,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
unsigned nxt_sq_head;
long long tmp, tmp_nxt;
+ u32 nxt_offset = nxt->timeout.data->seq_offset;
if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
continue;
@@ -2383,8 +2494,8 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
* long to store it.
*/
tmp = (long long)ctx->cached_sq_head + count - 1;
- nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
- tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
+ nxt_sq_head = nxt->sequence - nxt_offset + 1;
+ tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
/*
* cached_sq_head may overflow, and it will never overflow twice
@@ -2406,8 +2517,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->sequence -= span;
add:
list_add(&req->list, entry);
- req->timeout.timer.function = io_timeout_fn;
- hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode);
+ data = req->timeout.data;
+ data->timer.function = io_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
spin_unlock_irq(&ctx->completion_lock);
return 0;
}
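For context, a hedged userspace sketch of the SQE this function consumes (assumes liburing; the 2-second/8-completion values are illustrative). io_timeout_setup() above reads the timespec from sqe->addr and requires sqe->len == 1, while io_timeout() stores the completion count from sqe->off:

#include <liburing.h>

/* illustrative sketch, not part of this patch */
static int example_arm_timeout(struct io_uring *ring)
{
	struct __kernel_timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;
	/* fills sqe->addr (&ts), sqe->len (1) and sqe->off (count of 8) */
	io_uring_prep_timeout(sqe, &ts, 8, 0);
	return io_uring_submit(ring);
}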
@@ -2442,7 +2554,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
struct io_kiocb *req, __u64 sqe_addr,
- struct io_kiocb **nxt)
+ struct io_kiocb **nxt, int success_ret)
{
unsigned long flags;
int ret;
@@ -2459,6 +2571,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
goto done;
ret = io_poll_cancel(ctx, sqe_addr);
done:
+ if (!ret)
+ ret = success_ret;
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -2480,13 +2594,12 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
sqe->cancel_flags)
return -EINVAL;
- io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL);
+ io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0);
return 0;
}
static int io_req_defer(struct io_kiocb *req)
{
- const struct io_uring_sqe *sqe = req->submit.sqe;
struct io_uring_sqe *sqe_copy;
struct io_ring_ctx *ctx = req->ctx;
@@ -2505,34 +2618,35 @@ static int io_req_defer(struct io_kiocb *req)
return 0;
}
- memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
- req->submit.sqe = sqe_copy;
+ memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy));
+ req->flags |= REQ_F_FREE_SQE;
+ req->sqe = sqe_copy;
- trace_io_uring_defer(ctx, req, false);
+ trace_io_uring_defer(ctx, req, req->user_data);
list_add_tail(&req->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
return -EIOCBQUEUED;
}
-static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+__attribute__((nonnull))
+static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
{
int ret, opcode;
- struct sqe_submit *s = &req->submit;
struct io_ring_ctx *ctx = req->ctx;
- opcode = READ_ONCE(s->sqe->opcode);
+ opcode = READ_ONCE(req->sqe->opcode);
switch (opcode) {
case IORING_OP_NOP:
ret = io_nop(req);
break;
case IORING_OP_READV:
- if (unlikely(s->sqe->buf_index))
+ if (unlikely(req->sqe->buf_index))
return -EINVAL;
ret = io_read(req, nxt, force_nonblock);
break;
case IORING_OP_WRITEV:
- if (unlikely(s->sqe->buf_index))
+ if (unlikely(req->sqe->buf_index))
return -EINVAL;
ret = io_write(req, nxt, force_nonblock);
break;
@@ -2543,34 +2657,37 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
ret = io_write(req, nxt, force_nonblock);
break;
case IORING_OP_FSYNC:
- ret = io_fsync(req, s->sqe, nxt, force_nonblock);
+ ret = io_fsync(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_POLL_ADD:
- ret = io_poll_add(req, s->sqe, nxt);
+ ret = io_poll_add(req, req->sqe, nxt);
break;
case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove(req, s->sqe);
+ ret = io_poll_remove(req, req->sqe);
break;
case IORING_OP_SYNC_FILE_RANGE:
- ret = io_sync_file_range(req, s->sqe, nxt, force_nonblock);
+ ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_SENDMSG:
- ret = io_sendmsg(req, s->sqe, nxt, force_nonblock);
+ ret = io_sendmsg(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_RECVMSG:
- ret = io_recvmsg(req, s->sqe, nxt, force_nonblock);
+ ret = io_recvmsg(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_TIMEOUT:
- ret = io_timeout(req, s->sqe);
+ ret = io_timeout(req, req->sqe);
break;
case IORING_OP_TIMEOUT_REMOVE:
- ret = io_timeout_remove(req, s->sqe);
+ ret = io_timeout_remove(req, req->sqe);
break;
case IORING_OP_ACCEPT:
- ret = io_accept(req, s->sqe, nxt, force_nonblock);
+ ret = io_accept(req, req->sqe, nxt, force_nonblock);
+ break;
+ case IORING_OP_CONNECT:
+ ret = io_connect(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_ASYNC_CANCEL:
- ret = io_async_cancel(req, s->sqe, nxt);
+ ret = io_async_cancel(req, req->sqe, nxt);
break;
default:
ret = -EINVAL;
@@ -2585,22 +2702,29 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
return -EAGAIN;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (s->in_async)
+ if (req->in_async)
mutex_lock(&ctx->uring_lock);
io_iopoll_req_issued(req);
- if (s->in_async)
+ if (req->in_async)
mutex_unlock(&ctx->uring_lock);
}
return 0;
}
+static void io_link_work_cb(struct io_wq_work **workptr)
+{
+ struct io_wq_work *work = *workptr;
+ struct io_kiocb *link = work->data;
+
+ io_queue_linked_timeout(link);
+ work->func = io_wq_submit_work;
+}
+
static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct sqe_submit *s = &req->submit;
- const struct io_uring_sqe *sqe = s->sqe;
struct io_kiocb *nxt = NULL;
int ret = 0;
@@ -2611,10 +2735,10 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
ret = -ECANCELED;
if (!ret) {
- s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
- s->in_async = true;
+ req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
+ req->in_async = true;
do {
- ret = __io_submit_sqe(req, &nxt, false);
+ ret = io_issue_sqe(req, &nxt, false);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -2636,13 +2760,17 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
io_put_req(req);
}
- /* async context always use a copy of the sqe */
- kfree(sqe);
-
/* if a dependent link is ready, pass it back */
if (!ret && nxt) {
- io_prep_async_work(nxt);
+ struct io_kiocb *link;
+
+ io_prep_async_work(nxt, &link);
*workptr = &nxt->work;
+ if (link) {
+ nxt->work.flags |= IO_WQ_WORK_CB;
+ nxt->work.func = io_link_work_cb;
+ nxt->work.data = link;
+ }
}
}
@@ -2674,24 +2802,17 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
{
- struct sqe_submit *s = &req->submit;
struct io_ring_ctx *ctx = req->ctx;
unsigned flags;
int fd;
- flags = READ_ONCE(s->sqe->flags);
- fd = READ_ONCE(s->sqe->fd);
+ flags = READ_ONCE(req->sqe->flags);
+ fd = READ_ONCE(req->sqe->fd);
if (flags & IOSQE_IO_DRAIN)
req->flags |= REQ_F_IO_DRAIN;
- /*
- * All io need record the previous position, if LINK vs DARIN,
- * it can be used to mark the position of the first IO in the
- * link list.
- */
- req->sequence = s->sequence;
- if (!io_op_needs_file(s->sqe))
+ if (!io_op_needs_file(req->sqe))
return 0;
if (flags & IOSQE_FIXED_FILE) {
@@ -2704,7 +2825,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
return -EBADF;
req->flags |= REQ_F_FIXED_FILE;
} else {
- if (s->needs_fixed_file)
+ if (req->needs_fixed_file)
return -EBADF;
trace_io_uring_file_get(ctx, fd);
req->file = io_file_get(state, fd);
@@ -2728,7 +2849,7 @@ static int io_grab_files(struct io_kiocb *req)
* the fd has changed since we started down this path, and disallow
* this operation if it has.
*/
- if (fcheck(req->submit.ring_fd) == req->submit.ring_file) {
+ if (fcheck(req->ring_fd) == req->ring_file) {
list_add(&req->inflight_entry, &ctx->inflight_list);
req->flags |= REQ_F_INFLIGHT;
req->work.files = current->files;
@@ -2742,8 +2863,9 @@ static int io_grab_files(struct io_kiocb *req)
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
- struct io_kiocb *req = container_of(timer, struct io_kiocb,
- timeout.timer);
+ struct io_timeout_data *data = container_of(timer,
+ struct io_timeout_data, timer);
+ struct io_kiocb *req = data->req;
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *prev = NULL;
unsigned long flags;
@@ -2756,16 +2878,20 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
*/
if (!list_empty(&req->list)) {
prev = list_entry(req->list.prev, struct io_kiocb, link_list);
- if (refcount_inc_not_zero(&prev->refs))
+ if (refcount_inc_not_zero(&prev->refs)) {
list_del_init(&req->list);
- else
+ prev->flags &= ~REQ_F_LINK_TIMEOUT;
+ } else
prev = NULL;
}
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (prev) {
- io_async_find_and_cancel(ctx, req, prev->user_data, NULL);
+ if (prev->flags & REQ_F_LINK)
+ prev->flags |= REQ_F_FAIL_LINK;
+ io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
+ -ETIME);
io_put_req(prev);
} else {
io_cqring_add_event(req, -ETIME);
@@ -2774,8 +2900,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
- enum hrtimer_mode *mode)
+static void io_queue_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2785,9 +2910,11 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
*/
spin_lock_irq(&ctx->completion_lock);
if (!list_empty(&req->list)) {
- req->timeout.timer.function = io_link_timeout_fn;
- hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts),
- *mode);
+ struct io_timeout_data *data = req->timeout.data;
+
+ data->timer.function = io_link_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
+ data->mode);
}
spin_unlock_irq(&ctx->completion_lock);
@@ -2795,66 +2922,30 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
io_put_req(req);
}
-static int io_validate_link_timeout(const struct io_uring_sqe *sqe,
- struct timespec64 *ts)
-{
- if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off)
- return -EINVAL;
- if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS)
- return -EINVAL;
- if (get_timespec64(ts, u64_to_user_ptr(sqe->addr)))
- return -EFAULT;
-
- return 0;
-}
-
-static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
- struct timespec64 *ts,
- enum hrtimer_mode *mode)
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
struct io_kiocb *nxt;
- int ret;
if (!(req->flags & REQ_F_LINK))
return NULL;
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
- if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT)
+ if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT)
return NULL;
- ret = io_validate_link_timeout(nxt->submit.sqe, ts);
- if (ret) {
- list_del_init(&nxt->list);
- io_cqring_add_event(nxt, ret);
- io_double_put_req(nxt);
- return ERR_PTR(-ECANCELED);
- }
-
- if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS)
- *mode = HRTIMER_MODE_ABS;
- else
- *mode = HRTIMER_MODE_REL;
-
req->flags |= REQ_F_LINK_TIMEOUT;
- hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode);
return nxt;
}
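For reference, a hedged sketch of the SQE shape io_prep_linked_timeout() looks for (assumes liburing; fd and iov are placeholders): an IORING_OP_LINK_TIMEOUT entry queued directly behind a linked request bounds just that request.

#include <liburing.h>

/* illustrative sketch, not part of this patch */
static int example_read_with_timeout(struct io_uring *ring, int fd,
				     struct iovec *iov,
				     struct __kernel_timespec *ts)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_readv(sqe, fd, iov, 1, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* timeout applies to this read */

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_link_timeout(sqe, ts, 0);

	return io_uring_submit(ring);
}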
-static int __io_queue_sqe(struct io_kiocb *req)
+static void __io_queue_sqe(struct io_kiocb *req)
{
- enum hrtimer_mode mode;
- struct io_kiocb *nxt;
- struct timespec64 ts;
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_kiocb *nxt = NULL;
int ret;
- nxt = io_prep_linked_timeout(req, &ts, &mode);
- if (IS_ERR(nxt)) {
- ret = PTR_ERR(nxt);
- nxt = NULL;
- goto err;
- }
-
- ret = __io_submit_sqe(req, NULL, true);
+ ret = io_issue_sqe(req, &nxt, true);
+ if (nxt)
+ io_queue_async_work(nxt);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2862,42 +2953,38 @@ static int __io_queue_sqe(struct io_kiocb *req)
*/
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
- struct sqe_submit *s = &req->submit;
struct io_uring_sqe *sqe_copy;
- sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
- if (sqe_copy) {
- s->sqe = sqe_copy;
- if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
- ret = io_grab_files(req);
- if (ret) {
- kfree(sqe_copy);
- goto err;
- }
- }
-
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req);
+ sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+ if (!sqe_copy)
+ goto err;
- if (nxt)
- io_queue_linked_timeout(nxt, &ts, &mode);
+ req->sqe = sqe_copy;
+ req->flags |= REQ_F_FREE_SQE;
- return 0;
+ if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
+ ret = io_grab_files(req);
+ if (ret)
+ goto err;
}
+
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req);
+ return;
}
err:
/* drop submission reference */
io_put_req(req);
- if (nxt) {
+ if (linked_timeout) {
if (!ret)
- io_queue_linked_timeout(nxt, &ts, &mode);
+ io_queue_linked_timeout(linked_timeout);
else
- io_put_req(nxt);
+ io_put_req(linked_timeout);
}
/* and drop final reference, if we failed */
@@ -2907,83 +2994,52 @@ err:
req->flags |= REQ_F_FAIL_LINK;
io_put_req(req);
}
-
- return ret;
}
-static int io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe(struct io_kiocb *req)
{
int ret;
- ret = io_req_defer(req);
- if (ret) {
- if (ret != -EIOCBQUEUED) {
- io_cqring_add_event(req, ret);
- io_double_put_req(req);
- }
- return 0;
+ if (unlikely(req->ctx->drain_next)) {
+ req->flags |= REQ_F_IO_DRAIN;
+ req->ctx->drain_next = false;
}
+ req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
- return __io_queue_sqe(req);
-}
-
-static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
-{
- int ret;
- int need_submit = false;
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!shadow)
- return io_queue_sqe(req);
-
- /*
- * Mark the first IO in link list as DRAIN, let all the following
- * IOs enter the defer list. all IO needs to be completed before link
- * list.
- */
- req->flags |= REQ_F_IO_DRAIN;
ret = io_req_defer(req);
if (ret) {
if (ret != -EIOCBQUEUED) {
io_cqring_add_event(req, ret);
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
io_double_put_req(req);
- __io_free_req(shadow);
- return 0;
}
- } else {
- /*
- * If ret == 0 means that all IOs in front of link io are
- * running done. let's queue link head.
- */
- need_submit = true;
- }
-
- /* Insert shadow req to defer_list, blocking next IOs */
- spin_lock_irq(&ctx->completion_lock);
- trace_io_uring_defer(ctx, shadow, true);
- list_add_tail(&shadow->list, &ctx->defer_list);
- spin_unlock_irq(&ctx->completion_lock);
-
- if (need_submit)
- return __io_queue_sqe(req);
+ } else
+ __io_queue_sqe(req);
+}
- return 0;
+static inline void io_queue_link_head(struct io_kiocb *req)
+{
+ if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+ io_cqring_add_event(req, -ECANCELED);
+ io_double_put_req(req);
+ } else
+ io_queue_sqe(req);
}
+
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
struct io_kiocb **link)
{
- struct io_uring_sqe *sqe_copy;
- struct sqe_submit *s = &req->submit;
struct io_ring_ctx *ctx = req->ctx;
int ret;
- req->user_data = s->sqe->user_data;
+ req->user_data = req->sqe->user_data;
/* enforce forwards compatibility on users */
- if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
+ if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) {
ret = -EINVAL;
goto err_req;
}
@@ -3005,25 +3061,37 @@ err_req:
*/
if (*link) {
struct io_kiocb *prev = *link;
+ struct io_uring_sqe *sqe_copy;
- sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+ if (req->sqe->flags & IOSQE_IO_DRAIN)
+ (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
+
+ if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
+ ret = io_timeout_setup(req);
+			/* common setup allows an offset to be set; we don't */
+ if (!ret && req->sqe->off)
+ ret = -EINVAL;
+ if (ret) {
+ prev->flags |= REQ_F_FAIL_LINK;
+ goto err_req;
+ }
+ }
+
+ sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
if (!sqe_copy) {
ret = -EAGAIN;
goto err_req;
}
- s->sqe = sqe_copy;
+ req->sqe = sqe_copy;
+ req->flags |= REQ_F_FREE_SQE;
trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->list, &prev->link_list);
- } else if (s->sqe->flags & IOSQE_IO_LINK) {
+ } else if (req->sqe->flags & IOSQE_IO_LINK) {
req->flags |= REQ_F_LINK;
INIT_LIST_HEAD(&req->link_list);
*link = req;
- } else if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
- /* Only valid as a linked SQE */
- ret = -EINVAL;
- goto err_req;
} else {
io_queue_sqe(req);
}
@@ -3075,7 +3143,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req)
{
struct io_rings *rings = ctx->rings;
u32 *sq_array = ctx->sq_array;
@@ -3091,14 +3159,18 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
*/
head = ctx->cached_sq_head;
/* make sure SQ entry isn't read before tail */
- if (head == smp_load_acquire(&rings->sq.tail))
+ if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
return false;
head = READ_ONCE(sq_array[head & ctx->sq_mask]);
- if (head < ctx->sq_entries) {
- s->ring_file = NULL;
- s->sqe = &ctx->sq_sqes[head];
- s->sequence = ctx->cached_sq_head;
+ if (likely(head < ctx->sq_entries)) {
+ /*
+		 * All IO needs to record the previous position; when LINK is
+		 * combined with DRAIN, it marks the position of the first IO
+		 * in the link list.
+ */
+ req->sequence = ctx->cached_sq_head;
+ req->sqe = &ctx->sq_sqes[head];
ctx->cached_sq_head++;
return true;
}
@@ -3116,14 +3188,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
- struct io_kiocb *shadow_req = NULL;
int i, submitted = 0;
bool mm_fault = false;
- if (!list_empty(&ctx->cq_overflow_list)) {
- io_cqring_overflow_flush(ctx, false);
+ /* if we have a backlog and couldn't flush it all, return BUSY */
+ if (!list_empty(&ctx->cq_overflow_list) &&
+ !io_cqring_overflow_flush(ctx, false))
return -EBUSY;
- }
if (nr > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, ctx, nr);
@@ -3140,12 +3211,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
submitted = -EAGAIN;
break;
}
- if (!io_get_sqring(ctx, &req->submit)) {
+ if (!io_get_sqring(ctx, req)) {
__io_free_req(req);
break;
}
- if (io_sqe_needs_user(req->submit.sqe) && !*mm) {
+ if (io_sqe_needs_user(req->sqe) && !*mm) {
mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
if (!mm_fault) {
use_mm(ctx->sqo_mm);
@@ -3153,26 +3224,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
}
}
- sqe_flags = req->submit.sqe->flags;
-
- if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
- if (!shadow_req) {
- shadow_req = io_get_req(ctx, NULL);
- if (unlikely(!shadow_req))
- goto out;
- shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
- refcount_dec(&shadow_req->refs);
- }
- shadow_req->sequence = req->submit.sequence;
- }
+ sqe_flags = req->sqe->flags;
-out:
- req->submit.ring_file = ring_file;
- req->submit.ring_fd = ring_fd;
- req->submit.has_user = *mm != NULL;
- req->submit.in_async = async;
- req->submit.needs_fixed_file = async;
- trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data,
+ req->ring_file = ring_file;
+ req->ring_fd = ring_fd;
+ req->has_user = *mm != NULL;
+ req->in_async = async;
+ req->needs_fixed_file = async;
+ trace_io_uring_submit_sqe(ctx, req->sqe->user_data,
true, async);
io_submit_sqe(req, statep, &link);
submitted++;
@@ -3182,14 +3241,13 @@ out:
* that's the end of the chain. Submit the previous link.
*/
if (!(sqe_flags & IOSQE_IO_LINK) && link) {
- io_queue_link_head(link, shadow_req);
+ io_queue_link_head(link);
link = NULL;
- shadow_req = NULL;
}
}
if (link)
- io_queue_link_head(link, shadow_req);
+ io_queue_link_head(link);
if (statep)
io_submit_state_end(&state);
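A hedged userspace sketch of the chain-termination rule the loop above implements (assumes liburing; fd and iov are placeholders): the first SQE without IOSQE_IO_LINK ends the chain and triggers io_queue_link_head().

#include <liburing.h>

/* illustrative sketch, not part of this patch */
static int example_write_then_fsync(struct io_uring *ring, int fd,
				    struct iovec *iov)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_writev(sqe, fd, iov, 1, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* fsync waits for the write */

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_fsync(sqe, fd, 0);	/* no link flag: chain ends */

	return io_uring_submit(ring);
}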
@@ -3203,6 +3261,7 @@ static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
struct mm_struct *cur_mm = NULL;
+ const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
unsigned inflight;
@@ -3213,6 +3272,7 @@ static int io_sq_thread(void *data)
old_fs = get_fs();
set_fs(USER_DS);
+ old_cred = override_creds(ctx->creds);
ret = timeout = inflight = 0;
while (!kthread_should_park()) {
@@ -3319,6 +3379,7 @@ static int io_sq_thread(void *data)
unuse_mm(cur_mm);
mmput(cur_mm);
}
+ revert_creds(old_cred);
kthread_parkme();
@@ -3898,6 +3959,7 @@ static void io_get_work(struct io_wq_work *work)
static int io_sq_offload_start(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
+ struct io_wq_data data;
unsigned concurrency;
int ret;
@@ -3942,10 +4004,15 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
goto err;
}
+ data.mm = ctx->sqo_mm;
+ data.user = ctx->user;
+ data.creds = ctx->creds;
+ data.get_work = io_get_work;
+ data.put_work = io_put_work;
+
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
- ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user,
- io_get_work, io_put_work);
+ ctx->io_wq = io_wq_create(concurrency, &data);
if (IS_ERR(ctx->io_wq)) {
ret = PTR_ERR(ctx->io_wq);
ctx->io_wq = NULL;
@@ -4294,6 +4361,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_unaccount_mem(ctx->user,
ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
+ put_cred(ctx->creds);
kfree(ctx->completions);
kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx);
@@ -4402,12 +4470,11 @@ static int io_uring_flush(struct file *file, void *data)
return 0;
}
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static void *io_uring_validate_mmap_request(struct file *file,
+ loff_t pgoff, size_t sz)
{
- loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
- unsigned long sz = vma->vm_end - vma->vm_start;
struct io_ring_ctx *ctx = file->private_data;
- unsigned long pfn;
+ loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;
@@ -4420,17 +4487,59 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
ptr = ctx->sq_sqes;
break;
default:
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
page = virt_to_head_page(ptr);
if (sz > page_size(page))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
+
+ return ptr;
+}
+
+#ifdef CONFIG_MMU
+
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ size_t sz = vma->vm_end - vma->vm_start;
+ unsigned long pfn;
+ void *ptr;
+
+ ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
+#else /* !CONFIG_MMU */
+
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
+}
+
+static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
+}
+
+static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ void *ptr;
+
+ ptr = io_uring_validate_mmap_request(file, pgoff, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ return (unsigned long) ptr;
+}
+
+#endif /* !CONFIG_MMU */
+
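For context, a hedged userspace sketch of the mappings io_uring_validate_mmap_request() serves (the region sizes come from the io_uring_params returned at setup; only the fixed ABI offsets are shown):

#include <sys/mman.h>
#include <linux/io_uring.h>

/* illustrative sketch, not part of this patch */
static void *example_map_region(int ring_fd, size_t size, off_t offset)
{
	/* offset is IORING_OFF_SQ_RING, IORING_OFF_CQ_RING or
	 * IORING_OFF_SQES; anything else is rejected with -EINVAL above */
	return mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, ring_fd, offset);
}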
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
u32, min_complete, u32, flags, const sigset_t __user *, sig,
size_t, sigsz)
@@ -4501,6 +4610,10 @@ static const struct file_operations io_uring_fops = {
.release = io_uring_release,
.flush = io_uring_flush,
.mmap = io_uring_mmap,
+#ifndef CONFIG_MMU
+ .get_unmapped_area = io_uring_nommu_get_unmapped_area,
+ .mmap_capabilities = io_uring_nommu_mmap_capabilities,
+#endif
.poll = io_uring_poll,
.fasync = io_uring_fasync,
};
@@ -4531,12 +4644,18 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ctx->cq_entries = rings->cq_ring_entries;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
- if (size == SIZE_MAX)
+ if (size == SIZE_MAX) {
+ io_mem_free(ctx->rings);
+ ctx->rings = NULL;
return -EOVERFLOW;
+ }
ctx->sq_sqes = io_mem_alloc(size);
- if (!ctx->sq_sqes)
+ if (!ctx->sq_sqes) {
+ io_mem_free(ctx->rings);
+ ctx->rings = NULL;
return -ENOMEM;
+ }
return 0;
}
@@ -4640,6 +4759,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
ctx->compat = in_compat_syscall();
ctx->account_mem = account_mem;
ctx->user = user;
+ ctx->creds = prepare_creds();
ret = io_allocate_scq_urings(ctx, p);
if (ret)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index fef3a6bf7c78..812061ba667a 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -8,6 +8,7 @@
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/capability.h>
+#include <linux/compat.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/security.h>
@@ -174,10 +175,9 @@ static int fiemap_check_ranges(struct super_block *sb,
return 0;
}
-static int ioctl_fiemap(struct file *filp, unsigned long arg)
+static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
{
struct fiemap fiemap;
- struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
struct fiemap_extent_info fieinfo = { 0, };
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
@@ -244,7 +244,8 @@ fdput:
return ret;
}
-static long ioctl_file_clone_range(struct file *file, void __user *argp)
+static long ioctl_file_clone_range(struct file *file,
+ struct file_clone_range __user *argp)
{
struct file_clone_range args;
@@ -490,6 +491,35 @@ int ioctl_preallocate(struct file *filp, void __user *argp)
return vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
}
+/* on ia32 l_start is on a 32-bit boundary */
+#if defined CONFIG_COMPAT && defined(CONFIG_X86_64)
+/* just account for different alignment */
+int compat_ioctl_preallocate(struct file *file,
+ struct space_resv_32 __user *argp)
+{
+ struct inode *inode = file_inode(file);
+ struct space_resv_32 sr;
+
+ if (copy_from_user(&sr, argp, sizeof(sr)))
+ return -EFAULT;
+
+ switch (sr.l_whence) {
+ case SEEK_SET:
+ break;
+ case SEEK_CUR:
+ sr.l_start += file->f_pos;
+ break;
+ case SEEK_END:
+ sr.l_start += i_size_read(inode);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return vfs_fallocate(file, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
+}
+#endif
+
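For reference, the compat layout copied in above is roughly the following (an assumption based on the ia32 ABI; the real definition lives alongside the fallocate declarations). The individually packed 64-bit members are what leave l_start on a 32-bit boundary:

/* assumed layout, shown for illustration only */
struct space_resv_32 {
	__s16	l_type;
	__s16	l_whence;
	__s64	l_start	__attribute__((packed));
	__s64	l_len	__attribute__((packed));
	__s64	l_sysid	__attribute__((packed));
	__u32	l_pid;
	__s32	l_pad[4];
};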
static int file_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
@@ -584,9 +614,9 @@ static int ioctl_fsthaw(struct file *filp)
return thaw_super(sb);
}
-static int ioctl_file_dedupe_range(struct file *file, void __user *arg)
+static int ioctl_file_dedupe_range(struct file *file,
+ struct file_dedupe_range __user *argp)
{
- struct file_dedupe_range __user *argp = arg;
struct file_dedupe_range *same = NULL;
int ret;
unsigned long size;
@@ -635,7 +665,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
unsigned long arg)
{
int error = 0;
- int __user *argp = (int __user *)arg;
+ void __user *argp = (void __user *)arg;
struct inode *inode = file_inode(filp);
switch (cmd) {
@@ -674,13 +704,13 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
break;
case FS_IOC_FIEMAP:
- return ioctl_fiemap(filp, arg);
+ return ioctl_fiemap(filp, argp);
case FIGETBSZ:
/* anon_bdev filesystems may not have a block size */
if (!inode->i_sb->s_blocksize)
return -EINVAL;
- return put_user(inode->i_sb->s_blocksize, argp);
+ return put_user(inode->i_sb->s_blocksize, (int __user *)argp);
case FICLONE:
return ioctl_file_clone(filp, arg, 0, 0, 0);
@@ -719,3 +749,37 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
return ksys_ioctl(fd, cmd, arg);
}
+
+#ifdef CONFIG_COMPAT
+/**
+ * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
+ *
+ * This is not normally called as a function, but instead set in struct
+ * file_operations as
+ *
+ * .compat_ioctl = compat_ptr_ioctl,
+ *
+ * On most architectures, compat_ptr_ioctl() simply passes all arguments
+ * to the corresponding ->ioctl handler. The exception is arch/s390, where
+ * compat_ptr() clears the top bit of a 32-bit pointer value, so user space
+ * pointers to the second 2GB alias the first 2GB, as is the case for
+ * native 32-bit s390 user space.
+ *
+ * The compat_ptr_ioctl() function must therefore be used only with ioctl
+ * functions that either ignore the argument or pass a pointer to a
+ * compatible data type.
+ *
+ * If any ioctl command handled by fops->unlocked_ioctl passes a plain
+ * integer instead of a pointer, or any of the passed data types
+ * is incompatible between 32-bit and 64-bit architectures, a proper
+ * handler is required instead of compat_ptr_ioctl.
+ */
+long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ if (!file->f_op->unlocked_ioctl)
+ return -ENOIOCTLCMD;
+
+ return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+EXPORT_SYMBOL(compat_ptr_ioctl);
+#endif
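As the kernel-doc above describes, a driver whose ioctl commands all take pointer arguments can adopt this helper with one line; a hedged sketch (example_ioctl is hypothetical):

/* illustrative sketch, not part of this patch */
static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= example_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};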
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 93cd11938bf5..eef2722d93a1 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -3,13 +3,15 @@
# Copyright (c) 2019 Oracle.
# All Rights Reserved.
#
-obj-$(CONFIG_FS_IOMAP) += iomap.o
-iomap-y += \
- apply.o \
- buffered-io.o \
- direct-io.o \
- fiemap.o \
- seek.o
+ccflags-y += -I $(srctree)/$(src) # needed for trace events
+
+obj-$(CONFIG_FS_IOMAP) += iomap.o
+iomap-y += trace.o \
+ apply.o \
+ buffered-io.o \
+ direct-io.o \
+ fiemap.o \
+ seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
index 54c02aecf3cd..76925b40b5fd 100644
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -7,6 +7,7 @@
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
+#include "trace.h"
/*
 * Execute an iomap write on a segment of the mapping that spans a
@@ -23,8 +24,12 @@ loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
const struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
- struct iomap iomap = { 0 };
+ struct iomap iomap = { .type = IOMAP_HOLE };
+ struct iomap srcmap = { .type = IOMAP_HOLE };
loff_t written = 0, ret;
+ u64 end;
+
+ trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_);
/*
* Need to map a range from start position for length bytes. This can
@@ -38,7 +43,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
* expose transient stale data. If the reserve fails, we can safely
* back out at this point as there is nothing to undo.
*/
- ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+ ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
if (ret)
return ret;
if (WARN_ON(iomap.offset > pos))
@@ -46,19 +51,34 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
if (WARN_ON(iomap.length == 0))
return -EIO;
+ trace_iomap_apply_dstmap(inode, &iomap);
+ if (srcmap.type != IOMAP_HOLE)
+ trace_iomap_apply_srcmap(inode, &srcmap);
+
/*
* Cut down the length to the one actually provided by the filesystem,
* as it might not be able to give us the whole size that we requested.
*/
- if (iomap.offset + iomap.length < pos + length)
- length = iomap.offset + iomap.length - pos;
+ end = iomap.offset + iomap.length;
+ if (srcmap.type != IOMAP_HOLE)
+ end = min(end, srcmap.offset + srcmap.length);
+ if (pos + length > end)
+ length = end - pos;
/*
- * Now that we have guaranteed that the space allocation will succeed.
+ * Now that we have guaranteed that the space allocation will succeed,
* we can do the copy-in page by page without having to worry about
* failures exposing transient data.
+ *
+	 * To support COW operations, we read in data for partially written
+	 * blocks from the srcmap if the file system filled it in. In that
+	 * case the length needs to be limited to the earlier of the ends of
+	 * the iomaps.
+ * If the file system did not provide a srcmap we pass in the normal
+ * iomap into the actors so that they don't need to have special
+ * handling for the two cases.
*/
- written = actor(inode, pos, length, data, &iomap);
+ written = actor(inode, pos, length, data, &iomap,
+ srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);
/*
* Now the data has been copied, commit the range we've copied. This
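To make the new calling convention concrete, a hedged sketch of an actor under this change (names are illustrative): srcmap aliases iomap unless the filesystem filled in a distinct source mapping for a COW write.

/* illustrative sketch, not part of this patch */
static loff_t
example_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap, struct iomap *srcmap)
{
	/* read existing data through srcmap, write new data through iomap */
	return length;	/* bytes processed; <= 0 terminates iomap_apply() */
}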
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e25901ae3ff4..d33c7bc5ee92 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (C) 2016-2019 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -12,13 +12,34 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
+#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
+#include "trace.h"
#include "../internal.h"
+/*
+ * Structure allocated for each page when block size < PAGE_SIZE to track
+ * sub-page uptodate status and I/O completions.
+ */
+struct iomap_page {
+ atomic_t read_count;
+ atomic_t write_count;
+ DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+};
+
+static inline struct iomap_page *to_iomap_page(struct page *page)
+{
+ if (page_has_private(page))
+ return (struct iomap_page *)page_private(page);
+ return NULL;
+}
+
+static struct bio_set iomap_ioend_bioset;
+
static struct iomap_page *
iomap_page_create(struct inode *inode, struct page *page)
{
@@ -203,9 +224,17 @@ iomap_read_inline_data(struct inode *inode, struct page *page,
SetPageUptodate(page);
}
+static inline bool iomap_block_needs_zeroing(struct inode *inode,
+ struct iomap *iomap, loff_t pos)
+{
+ return iomap->type != IOMAP_MAPPED ||
+ (iomap->flags & IOMAP_F_NEW) ||
+ pos >= i_size_read(inode);
+}
+
static loff_t
iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
struct page *page = ctx->cur_page;
@@ -226,7 +255,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (plen == 0)
goto done;
- if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
+ if (iomap_block_needs_zeroing(inode, iomap, pos)) {
zero_user(page, poff, plen);
iomap_set_range_uptodate(page, poff, plen);
goto done;
@@ -293,6 +322,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
unsigned poff;
loff_t ret;
+ trace_iomap_readpage(page->mapping->host, 1);
+
for (poff = 0; poff < PAGE_SIZE; poff += ret) {
ret = iomap_apply(inode, page_offset(page) + poff,
PAGE_SIZE - poff, 0, ops, &ctx,
@@ -351,7 +382,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
static loff_t
iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
loff_t done, ret;
@@ -371,7 +402,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
ctx->cur_page_in_bio = false;
}
ret = iomap_readpage_actor(inode, pos + done, length - done,
- ctx, iomap);
+ ctx, iomap, srcmap);
}
return done;
@@ -389,6 +420,8 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
loff_t last = page_offset(list_entry(pages->next, struct page, lru));
loff_t length = last - pos + PAGE_SIZE, ret = 0;
+ trace_iomap_readpages(mapping->host, nr_pages);
+
while (length > 0) {
ret = iomap_apply(mapping->host, pos, length, 0, ops,
&ctx, iomap_readpages_actor);
@@ -455,6 +488,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
int
iomap_releasepage(struct page *page, gfp_t gfp_mask)
{
+ trace_iomap_releasepage(page->mapping->host, page, 0, 0);
+
/*
* mm accommodates an old ext3 case where clean pages might not have had
* the dirty bit cleared. Thus, it can send actual dirty pages to
@@ -470,6 +505,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage);
void
iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
{
+ trace_iomap_invalidatepage(page->mapping->host, page, offset, len);
+
/*
* If we are invalidating the entire page, clear the dirty state from it
* and release it to avoid unnecessary buildup of the LRU.
@@ -511,6 +548,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
EXPORT_SYMBOL_GPL(iomap_migrate_page);
#endif /* CONFIG_MIGRATION */
+enum {
+ IOMAP_WRITE_F_UNSHARE = (1 << 0),
+};
+
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -525,19 +566,12 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
}
static int
-iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
- unsigned poff, unsigned plen, unsigned from, unsigned to,
- struct iomap *iomap)
+iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
+ unsigned plen, struct iomap *iomap)
{
struct bio_vec bvec;
struct bio bio;
- if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
- zero_user_segments(page, poff, from, to, poff + plen);
- iomap_set_range_uptodate(page, poff, plen);
- return 0;
- }
-
bio_init(&bio, &bvec, 1);
bio.bi_opf = REQ_OP_READ;
bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
@@ -547,15 +581,15 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
}
static int
-__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
- struct page *page, struct iomap *iomap)
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
+ struct page *page, struct iomap *srcmap)
{
struct iomap_page *iop = iomap_page_create(inode, page);
loff_t block_size = i_blocksize(inode);
loff_t block_start = pos & ~(block_size - 1);
loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
unsigned from = offset_in_page(pos), to = from + len, poff, plen;
- int status = 0;
+ int status;
if (PageUptodate(page))
return 0;
@@ -566,29 +600,39 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
if (plen == 0)
break;
- if ((from > poff && from < poff + plen) ||
- (to > poff && to < poff + plen)) {
- status = iomap_read_page_sync(inode, block_start, page,
- poff, plen, from, to, iomap);
- if (status)
- break;
+ if (!(flags & IOMAP_WRITE_F_UNSHARE) &&
+ (from <= poff || from >= poff + plen) &&
+ (to <= poff || to >= poff + plen))
+ continue;
+
+ if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
+ if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
+ return -EIO;
+ zero_user_segments(page, poff, from, to, poff + plen);
+ iomap_set_range_uptodate(page, poff, plen);
+ continue;
}
+ status = iomap_read_page_sync(block_start, page, poff, plen,
+ srcmap);
+ if (status)
+ return status;
} while ((block_start += plen) < block_end);
- return status;
+ return 0;
}
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, struct iomap *iomap)
+ struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
{
const struct iomap_page_ops *page_ops = iomap->page_ops;
- pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int status = 0;
BUG_ON(pos + len > iomap->offset + iomap->length);
+ if (srcmap != iomap)
+ BUG_ON(pos + len > srcmap->offset + srcmap->length);
if (fatal_signal_pending(current))
return -EINTR;
@@ -599,18 +643,20 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
return status;
}
- page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+ page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
+ AOP_FLAG_NOFS);
if (!page) {
status = -ENOMEM;
goto out_no_page;
}
- if (iomap->type == IOMAP_INLINE)
- iomap_read_inline_data(inode, page, iomap);
+ if (srcmap->type == IOMAP_INLINE)
+ iomap_read_inline_data(inode, page, srcmap);
else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
- status = __block_write_begin_int(page, pos, len, NULL, iomap);
+ status = __block_write_begin_int(page, pos, len, NULL, srcmap);
else
- status = __iomap_write_begin(inode, pos, len, page, iomap);
+ status = __iomap_write_begin(inode, pos, len, flags, page,
+ srcmap);
if (unlikely(status))
goto out_unlock;
@@ -656,7 +702,7 @@ EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
static int
__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page, struct iomap *iomap)
+ unsigned copied, struct page *page)
{
flush_dcache_page(page);
@@ -696,20 +742,20 @@ iomap_write_end_inline(struct inode *inode, struct page *page,
}
static int
-iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page, struct iomap *iomap)
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
+ struct page *page, struct iomap *iomap, struct iomap *srcmap)
{
const struct iomap_page_ops *page_ops = iomap->page_ops;
loff_t old_size = inode->i_size;
int ret;
- if (iomap->type == IOMAP_INLINE) {
+ if (srcmap->type == IOMAP_INLINE) {
ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
- } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
+ } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
page, NULL);
} else {
- ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
+ ret = __iomap_write_end(inode, pos, len, copied, page);
}
/*
@@ -736,12 +782,11 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ struct iomap *iomap, struct iomap *srcmap)
{
struct iov_iter *i = data;
long status = 0;
ssize_t written = 0;
- unsigned int flags = AOP_FLAG_NOFS;
do {
struct page *page;
@@ -771,8 +816,8 @@ again:
break;
}
- status = iomap_write_begin(inode, pos, bytes, flags, &page,
- iomap);
+ status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
+ srcmap);
if (unlikely(status))
break;
@@ -783,8 +828,8 @@ again:
flush_dcache_page(page);
- status = iomap_write_end(inode, pos, bytes, copied, page,
- iomap);
+ status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
+ srcmap);
if (unlikely(status < 0))
break;
copied = status;
@@ -835,50 +880,32 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
-static struct page *
-__iomap_read_page(struct inode *inode, loff_t offset)
-{
- struct address_space *mapping = inode->i_mapping;
- struct page *page;
-
- page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
- if (IS_ERR(page))
- return page;
- if (!PageUptodate(page)) {
- put_page(page);
- return ERR_PTR(-EIO);
- }
- return page;
-}
-
static loff_t
-iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap, struct iomap *srcmap)
{
long status = 0;
ssize_t written = 0;
- do {
- struct page *page, *rpage;
- unsigned long offset; /* Offset into pagecache page */
- unsigned long bytes; /* Bytes to write to page */
-
- offset = offset_in_page(pos);
- bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+ /* don't bother with blocks that are not shared to start with */
+ if (!(iomap->flags & IOMAP_F_SHARED))
+ return length;
+ /* don't bother with holes or unwritten extents */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ return length;
- rpage = __iomap_read_page(inode, pos);
- if (IS_ERR(rpage))
- return PTR_ERR(rpage);
+ do {
+ unsigned long offset = offset_in_page(pos);
+ unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+ struct page *page;
status = iomap_write_begin(inode, pos, bytes,
- AOP_FLAG_NOFS, &page, iomap);
- put_page(rpage);
+ IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
if (unlikely(status))
return status;
- WARN_ON_ONCE(!PageUptodate(page));
-
- status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+ status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
+ srcmap);
if (unlikely(status <= 0)) {
if (WARN_ON_ONCE(status == 0))
return -EIO;
@@ -898,14 +925,14 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
}
int
-iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops)
{
loff_t ret;
while (len) {
ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
- iomap_dirty_actor);
+ iomap_unshare_actor);
if (ret <= 0)
return ret;
pos += ret;
@@ -914,23 +941,22 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
return 0;
}
-EXPORT_SYMBOL_GPL(iomap_file_dirty);
+EXPORT_SYMBOL_GPL(iomap_file_unshare);
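A hedged sketch of a caller after the rename (example_iomap_ops is hypothetical): a COW filesystem's FALLOC_FL_UNSHARE_RANGE path would invoke the helper over the affected range.

/* illustrative sketch, not part of this patch */
static int example_unshare_range(struct inode *inode, loff_t off, loff_t len)
{
	return iomap_file_unshare(inode, off, len, &example_iomap_ops);
}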
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
- unsigned bytes, struct iomap *iomap)
+ unsigned bytes, struct iomap *iomap, struct iomap *srcmap)
{
struct page *page;
int status;
- status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
- iomap);
+ status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
if (status)
return status;
zero_user(page, offset, bytes);
mark_page_accessed(page);
- return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+ return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
}
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
@@ -942,14 +968,14 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
bool *did_zero = data;
loff_t written = 0;
int status;
/* already zeroed? we're done. */
- if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
return count;
do {
@@ -961,7 +987,8 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
if (IS_DAX(inode))
status = iomap_dax_zero(pos, offset, bytes, iomap);
else
- status = iomap_zero(inode, pos, offset, bytes, iomap);
+ status = iomap_zero(inode, pos, offset, bytes, iomap,
+ srcmap);
if (status < 0)
return status;
@@ -1011,7 +1038,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page);
static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct page *page = data;
int ret;
@@ -1040,20 +1067,19 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
lock_page(page);
size = i_size_read(inode);
- if ((page->mapping != inode->i_mapping) ||
- (page_offset(page) > size)) {
+ offset = page_offset(page);
+ if (page->mapping != inode->i_mapping || offset > size) {
/* We overload EFAULT to mean page got truncated */
ret = -EFAULT;
goto out_unlock;
}
/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_SHIFT) > size)
+ if (offset > size - PAGE_SIZE)
length = offset_in_page(size);
else
length = PAGE_SIZE;
- offset = page_offset(page);
while (length > 0) {
ret = iomap_apply(inode, offset, length,
IOMAP_WRITE | IOMAP_FAULT, ops, page,
@@ -1071,3 +1097,551 @@ out_unlock:
return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
+
+static void
+iomap_finish_page_writeback(struct inode *inode, struct page *page,
+ int error)
+{
+ struct iomap_page *iop = to_iomap_page(page);
+
+ if (error) {
+ SetPageError(page);
+ mapping_set_error(inode->i_mapping, -EIO);
+ }
+
+ WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
+ WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0);
+
+ if (!iop || atomic_dec_and_test(&iop->write_count))
+ end_page_writeback(page);
+}
+
+/*
+ * We're now finished for good with this ioend structure. Update the page
+ * state, release holds on bios, and finally free up memory. Do not use the
+ * ioend after this.
+ */
+static void
+iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+ struct inode *inode = ioend->io_inode;
+ struct bio *bio = &ioend->io_inline_bio;
+ struct bio *last = ioend->io_bio, *next;
+ u64 start = bio->bi_iter.bi_sector;
+ bool quiet = bio_flagged(bio, BIO_QUIET);
+
+ for (bio = &ioend->io_inline_bio; bio; bio = next) {
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
+
+ /*
+ * For the last bio, bi_private points to the ioend, so we
+ * need to explicitly end the iteration here.
+ */
+ if (bio == last)
+ next = NULL;
+ else
+ next = bio->bi_private;
+
+ /* walk each page on bio, ending page IO on them */
+ bio_for_each_segment_all(bv, bio, iter_all)
+ iomap_finish_page_writeback(inode, bv->bv_page, error);
+ bio_put(bio);
+ }
+
+ if (unlikely(error && !quiet)) {
+ printk_ratelimited(KERN_ERR
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino, ioend->io_offset,
+ start);
+ }
+}
+
+void
+iomap_finish_ioends(struct iomap_ioend *ioend, int error)
+{
+ struct list_head tmp;
+
+ list_replace_init(&ioend->io_list, &tmp);
+ iomap_finish_ioend(ioend, error);
+
+ while (!list_empty(&tmp)) {
+ ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
+ list_del_init(&ioend->io_list);
+ iomap_finish_ioend(ioend, error);
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_finish_ioends);
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool
+iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
+{
+ if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+ return false;
+ if ((ioend->io_flags & IOMAP_F_SHARED) ^
+ (next->io_flags & IOMAP_F_SHARED))
+ return false;
+ if ((ioend->io_type == IOMAP_UNWRITTEN) ^
+ (next->io_type == IOMAP_UNWRITTEN))
+ return false;
+ if (ioend->io_offset + ioend->io_size != next->io_offset)
+ return false;
+ return true;
+}
+
+void
+iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
+ void (*merge_private)(struct iomap_ioend *ioend,
+ struct iomap_ioend *next))
+{
+ struct iomap_ioend *next;
+
+ INIT_LIST_HEAD(&ioend->io_list);
+
+ while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
+ io_list))) {
+ if (!iomap_ioend_can_merge(ioend, next))
+ break;
+ list_move_tail(&next->io_list, &ioend->io_list);
+ ioend->io_size += next->io_size;
+ if (next->io_private && merge_private)
+ merge_private(ioend, next);
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
+
+static int
+iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
+ struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
+
+ if (ia->io_offset < ib->io_offset)
+ return -1;
+ if (ia->io_offset > ib->io_offset)
+ return 1;
+ return 0;
+}
+
+void
+iomap_sort_ioends(struct list_head *ioend_list)
+{
+ list_sort(NULL, ioend_list, iomap_ioend_compare);
+}
+EXPORT_SYMBOL_GPL(iomap_sort_ioends);
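Taken together, a hedged sketch of how a filesystem's writeback completion worker might use the three exported helpers (modelled loosely on such a worker; names are illustrative):

/* illustrative sketch, not part of this patch */
static void example_end_io_work(struct list_head *completions, int error)
{
	struct iomap_ioend *ioend;

	iomap_sort_ioends(completions);
	while ((ioend = list_first_entry_or_null(completions,
			struct iomap_ioend, io_list))) {
		list_del_init(&ioend->io_list);
		iomap_ioend_try_merge(ioend, completions, NULL);
		iomap_finish_ioends(ioend, error);
	}
}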
+
+static void iomap_writepage_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = bio->bi_private;
+
+ iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
+}
+
+/*
+ * Submit the final bio for an ioend.
+ *
+ * If @error is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked pages for writeback
+ * and unlocked them. In this situation, we need to fail the bio instead of
+ * submitting it. This typically only happens on a filesystem shutdown.
+ */
+static int
+iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
+ int error)
+{
+ ioend->io_bio->bi_private = ioend;
+ ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
+
+ if (wpc->ops->prepare_ioend)
+ error = wpc->ops->prepare_ioend(ioend, error);
+ if (error) {
+ /*
+ * If we are failing the IO now, just mark the ioend with an
+ * error and finish it. This will run IO completion immediately
+ * as there is only one reference to the ioend at this point in
+ * time.
+ */
+ ioend->io_bio->bi_status = errno_to_blk_status(error);
+ bio_endio(ioend->io_bio);
+ return error;
+ }
+
+ submit_bio(ioend->io_bio);
+ return 0;
+}
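A hedged sketch of the ->prepare_ioend hook this function calls (example_reserve_conversion is hypothetical): the hook may attach private state and may also report an error, which makes iomap_submit_ioend() fail the bio instead of submitting it.

/* illustrative sketch, not part of this patch */
static int example_prepare_ioend(struct iomap_ioend *ioend, int status)
{
	if (!status && ioend->io_type == IOMAP_UNWRITTEN)
		status = example_reserve_conversion(ioend);	/* hypothetical */
	return status;
}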
+
+static struct iomap_ioend *
+iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
+ loff_t offset, sector_t sector, struct writeback_control *wbc)
+{
+ struct iomap_ioend *ioend;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset);
+ bio_set_dev(bio, wpc->iomap.bdev);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ bio->bi_write_hint = inode->i_write_hint;
+ wbc_init_bio(wbc, bio);
+
+ ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
+ INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_type = wpc->iomap.type;
+ ioend->io_flags = wpc->iomap.flags;
+ ioend->io_inode = inode;
+ ioend->io_size = 0;
+ ioend->io_offset = offset;
+ ioend->io_private = NULL;
+ ioend->io_bio = bio;
+ return ioend;
+}
+
+/*
+ * Allocate a new bio, and chain the old bio to the new one.
+ *
+ * Note that we have to perform the chaining in this unintuitive order
+ * so that the bi_private linkage is set up in the right direction for the
+ * traversal in iomap_finish_ioend().
+ */
+static struct bio *
+iomap_chain_bio(struct bio *prev)
+{
+ struct bio *new;
+
+ new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+ bio_copy_dev(new, prev); /* also copies over blkcg information */
+ new->bi_iter.bi_sector = bio_end_sector(prev);
+ new->bi_opf = prev->bi_opf;
+ new->bi_write_hint = prev->bi_write_hint;
+
+ bio_chain(prev, new);
+ bio_get(prev); /* for iomap_finish_ioend */
+ submit_bio(prev);
+ return new;
+}
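
The order matters because bio_chain(prev, new) stores the forward link in prev->bi_private, which is exactly what the completion side follows. A simplified sketch of the walk in iomap_finish_ioend() earlier in this file:

	struct bio *bio, *next;

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		next = bio->bi_private;	/* set up by bio_chain() above */
		/* ... end writeback on the pages covered by this bio ... */
		bio_put(bio);
	}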
+
+static bool
+iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
+ sector_t sector)
+{
+ if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
+ (wpc->ioend->io_flags & IOMAP_F_SHARED))
+ return false;
+ if (wpc->iomap.type != wpc->ioend->io_type)
+ return false;
+ if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
+ return false;
+ if (sector != bio_end_sector(wpc->ioend->io_bio))
+ return false;
+ return true;
+}
+
+/*
+ * Test to see if we have an existing ioend structure that we could append to
+ * first, otherwise finish off the current ioend and start another.
+ */
+static void
+iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
+ struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct list_head *iolist)
+{
+ sector_t sector = iomap_sector(&wpc->iomap, offset);
+ unsigned len = i_blocksize(inode);
+ unsigned poff = offset & (PAGE_SIZE - 1);
+ bool merged, same_page = false;
+
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
+ if (wpc->ioend)
+ list_add(&wpc->ioend->io_list, iolist);
+ wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
+ }
+
+ merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
+ &same_page);
+ if (iop && !same_page)
+ atomic_inc(&iop->write_count);
+
+ if (!merged) {
+ if (bio_full(wpc->ioend->io_bio, len)) {
+ wpc->ioend->io_bio =
+ iomap_chain_bio(wpc->ioend->io_bio);
+ }
+ bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ }
+
+ wpc->ioend->io_size += len;
+ wbc_account_cgroup_owner(wbc, page, len);
+}
+
+/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding blocks to is cached on the writepage context, and if the new block
+ * does not append to the cached ioend it will create a new ioend and cache that
+ * instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has been
+ * detected. While each ioend is submitted as soon as it is complete,
+ * batching optimisations are provided by higher level block plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+static int
+iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct inode *inode,
+ struct page *page, u64 end_offset)
+{
+ struct iomap_page *iop = to_iomap_page(page);
+ struct iomap_ioend *ioend, *next;
+ unsigned len = i_blocksize(inode);
+ u64 file_offset; /* file offset of page */
+ int error = 0, count = 0, i;
+ LIST_HEAD(submit_list);
+
+ WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
+ WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0);
+
+ /*
+ * Walk through the page to find areas to write back. If we run off the
+ * end of the current map or find the current map invalid, grab a new
+ * one.
+ */
+ for (i = 0, file_offset = page_offset(page);
+ i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
+ i++, file_offset += len) {
+ if (iop && !test_bit(i, iop->uptodate))
+ continue;
+
+ error = wpc->ops->map_blocks(wpc, inode, file_offset);
+ if (error)
+ break;
+ if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
+ continue;
+ if (wpc->iomap.type == IOMAP_HOLE)
+ continue;
+ iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
+ &submit_list);
+ count++;
+ }
+
+ WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
+ WARN_ON_ONCE(!PageLocked(page));
+ WARN_ON_ONCE(PageWriteback(page));
+
+ /*
+ * We cannot cancel the ioend directly here on error. We may have
+ * already set other pages under writeback and hence we have to run I/O
+ * completion to mark the error state of the pages under writeback
+ * appropriately.
+ */
+ if (unlikely(error)) {
+ if (!count) {
+ /*
+ * If the current page hasn't been added to ioend, it
+ * won't be affected by I/O completions and we must
+ * discard and unlock it right here.
+ */
+ if (wpc->ops->discard_page)
+ wpc->ops->discard_page(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ goto done;
+ }
+
+ /*
+ * If the page was not fully cleaned, we need to ensure that the
+ * higher layers come back to it correctly. That means we need
+ * to keep the page dirty, and for WB_SYNC_ALL writeback we need
+ * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
+ * so another attempt to write this page in this writeback sweep
+ * will be made.
+ */
+ set_page_writeback_keepwrite(page);
+ } else {
+ clear_page_dirty_for_io(page);
+ set_page_writeback(page);
+ }
+
+ unlock_page(page);
+
+ /*
+ * Preserve the original error if there was one, otherwise catch
+ * submission errors here and propagate into subsequent ioend
+ * submissions.
+ */
+ list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+ int error2;
+
+ list_del_init(&ioend->io_list);
+ error2 = iomap_submit_ioend(wpc, ioend, error);
+ if (error2 && !error)
+ error = error2;
+ }
+
+ /*
+ * We can end up here with no error and nothing to write only if we race
+ * with a partial page truncate on a sub-page block sized filesystem.
+ */
+ if (!count)
+ end_page_writeback(page);
+done:
+ mapping_set_error(page->mapping, error);
+ return error;
+}
+
+/*
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ */
+static int
+iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
+{
+ struct iomap_writepage_ctx *wpc = data;
+ struct inode *inode = page->mapping->host;
+ pgoff_t end_index;
+ u64 end_offset;
+ loff_t offset;
+
+ trace_iomap_writepage(inode, page, 0, 0);
+
+ /*
+ * Refuse to write the page out if we are called from reclaim context.
+ *
+ * This avoids stack overflows when called from deep call stacks in
+ * random callers for direct reclaim or memcg reclaim. We explicitly
+ * allow reclaim from kswapd as the stack usage there is relatively low.
+ *
+ * This should never happen except in the case of a VM regression so
+ * warn about it.
+ */
+ if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+ PF_MEMALLOC))
+ goto redirty;
+
+ /*
+ * Given that we do not allow direct reclaim to call us, we should
+ * never be called in a recursive filesystem reclaim context.
+ */
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
+ goto redirty;
+
+ /*
+ * Is this page beyond the end of the file?
+ *
+ * If the page index is less than end_index, adjust end_offset to the
+ * highest offset that this page should represent.
+ * -----------------------------------------------------
+ * | file mapping | <EOF> |
+ * -----------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | |
+ * ^--------------------------------^----------|--------
+ * | desired writeback range | see else |
+ * ---------------------------------^------------------|
+ */
+ offset = i_size_read(inode);
+ end_index = offset >> PAGE_SHIFT;
+ if (page->index < end_index)
+ end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT;
+ else {
+ /*
+ * Check whether the page to write out lies beyond i_size or
+ * straddles it.
+ * -------------------------------------------------------
+ * | file mapping | <EOF> |
+ * -------------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
+ * ^--------------------------------^-----------|---------
+ * | | Straddles |
+ * ---------------------------------^-----------|--------|
+ */
+ unsigned offset_into_page = offset & (PAGE_SIZE - 1);
+
+ /*
+ * Skip the page if it is fully outside i_size, e.g. due to a
+ * truncate operation that is in progress. We must redirty the
+ * page so that reclaim stops reclaiming it. Otherwise
+ * iomap_vm_releasepage() is called on it and gets confused.
+ *
+ * Note that end_index is an unsigned long. If the given offset is
+ * greater than 16TB on a 32-bit system, it would overflow if we
+ * checked whether the page is fully outside i_size via
+ * "if (page->index >= end_index + 1)", because "end_index + 1"
+ * would evaluate to 0. The page would then be redirtied and written
+ * out repeatedly, resulting in an infinite loop; the user program
+ * performing the operation would hang. Instead, verify the situation
+ * by checking whether the page to write is entirely beyond i_size or
+ * whether its offset is exactly at EOF.
+ */
+ if (page->index > end_index ||
+ (page->index == end_index && offset_into_page == 0))
+ goto redirty;
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each
+ * and every writepage invocation because it may be mmapped.
+ * "A file is mapped in multiples of the page size. For a file
+ * that is not a multiple of the page size, the remaining
+ * memory is zeroed when mapped, and writes to that region are
+ * not written out to the file."
+ */
+ zero_user_segment(page, offset_into_page, PAGE_SIZE);
+
+ /* Adjust the end_offset to the end of file */
+ end_offset = offset;
+ }
+
+ return iomap_writepage_map(wpc, wbc, inode, page, end_offset);
+
+redirty:
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+}
+
+int
+iomap_writepage(struct page *page, struct writeback_control *wbc,
+ struct iomap_writepage_ctx *wpc,
+ const struct iomap_writeback_ops *ops)
+{
+ int ret;
+
+ wpc->ops = ops;
+ ret = iomap_do_writepage(page, wbc, wpc);
+ if (!wpc->ioend)
+ return ret;
+ return iomap_submit_ioend(wpc, wpc->ioend, ret);
+}
+EXPORT_SYMBOL_GPL(iomap_writepage);
+
+int
+iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
+ struct iomap_writepage_ctx *wpc,
+ const struct iomap_writeback_ops *ops)
+{
+ int ret;
+
+ wpc->ops = ops;
+ ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
+ if (!wpc->ioend)
+ return ret;
+ return iomap_submit_ioend(wpc, wpc->ioend, ret);
+}
+EXPORT_SYMBOL_GPL(iomap_writepages);
+
+static int __init iomap_init(void)
+{
+ return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
+ offsetof(struct iomap_ioend, io_inline_bio),
+ BIOSET_NEED_BVECS);
+}
+fs_initcall(iomap_init);
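
Taken together, a hypothetical filesystem wires the new writeback path up roughly like this (all ex_* names are invented; only map_blocks is mandatory, while prepare_ioend and discard_page are optional hooks):

	static int ex_map_blocks(struct iomap_writepage_ctx *wpc,
			struct inode *inode, loff_t offset)
	{
		/* fill wpc->iomap with the extent covering @offset */
		return 0;
	}

	static const struct iomap_writeback_ops ex_writeback_ops = {
		.map_blocks	= ex_map_blocks,
	};

	static int ex_writepages(struct address_space *mapping,
			struct writeback_control *wbc)
	{
		struct iomap_writepage_ctx wpc = { };

		return iomap_writepages(mapping, wbc, &wpc, &ex_writeback_ops);
	}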
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 1fc28c2da279..420c0c82f0ac 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -318,7 +318,9 @@ zero_tail:
if (pad)
iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
}
- return copied ? copied : ret;
+ if (copied)
+ return copied;
+ return ret;
}
static loff_t
@@ -358,7 +360,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_dio *dio = data;
@@ -392,7 +394,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
*/
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
- const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+ bool wait_for_completion)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
@@ -400,7 +403,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos, start = pos;
loff_t end = iocb->ki_pos + count - 1, ret = 0;
unsigned int flags = IOMAP_DIRECT;
- bool wait_for_completion = is_sync_kiocb(iocb);
struct blk_plug plug;
struct iomap_dio *dio;
@@ -409,6 +411,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!count)
return 0;
+ if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
+ return -EIO;
+
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
return -ENOMEM;
@@ -430,7 +435,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (pos >= dio->i_size)
goto out_free_dio;
- if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
+ if (iter_is_iovec(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
flags |= IOMAP_WRITE;
@@ -497,8 +502,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
pos += ret;
- if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+ if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
+ /*
+ * We only report that we've read data up to i_size.
+ * Revert iter to a state corresponding to that as
+ * some callers (such as splice code) rely on it.
+ */
+ iov_iter_revert(iter, pos - dio->i_size);
break;
+ }
} while ((count = iov_iter_count(iter)) > 0);
blk_finish_plug(&plug);
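
With the extra argument, a call site would now look roughly like the sketch below (ex_iomap_ops is a placeholder; passing is_sync_kiocb() preserves the old behaviour, and callers may pass true to force synchronous completion):

	ret = iomap_dio_rw(iocb, iter, &ex_iomap_ops, NULL,
			   is_sync_kiocb(iocb));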
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index f26fdd36e383..bccf305ea9ce 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -44,7 +44,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ struct iomap *iomap, struct iomap *srcmap)
{
struct fiemap_ctx *ctx = data;
loff_t ret = length;
@@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(iomap_fiemap);
static loff_t
iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
sector_t *bno = data, addr;
@@ -133,12 +133,16 @@ iomap_bmap(struct address_space *mapping, sector_t bno,
struct inode *inode = mapping->host;
loff_t pos = bno << inode->i_blkbits;
unsigned blocksize = i_blocksize(inode);
+ int ret;
if (filemap_write_and_wait(mapping))
return 0;
bno = 0;
- iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
+ ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno,
+ iomap_bmap_actor);
+ if (ret)
+ return 0;
return bno;
}
EXPORT_SYMBOL_GPL(iomap_bmap);
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index c04bad4b2b43..89f61d93c0bc 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -119,7 +119,7 @@ out:
static loff_t
iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
switch (iomap->type) {
case IOMAP_UNWRITTEN:
@@ -165,7 +165,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole);
static loff_t
iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
switch (iomap->type) {
case IOMAP_HOLE:
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index 152a230f668d..a648dbf6991e 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -76,7 +76,8 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
* distinction between written and unwritten extents.
*/
static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
- loff_t count, void *data, struct iomap *iomap)
+ loff_t count, void *data, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct iomap_swapfile_info *isi = data;
int error;
diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c
new file mode 100644
index 000000000000..da217246b1a9
--- /dev/null
+++ b/fs/iomap/trace.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Christoph Hellwig
+ */
+#include <linux/iomap.h>
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
new file mode 100644
index 000000000000..6dc227b8c47e
--- /dev/null
+++ b/fs/iomap/trace.h
@@ -0,0 +1,191 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2009-2019 Christoph Hellwig
+ *
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
+ * as they can change at any time.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM iomap
+
+#if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _IOMAP_TRACE_H
+
+#include <linux/tracepoint.h>
+
+struct inode;
+
+DECLARE_EVENT_CLASS(iomap_readpage_class,
+ TP_PROTO(struct inode *inode, int nr_pages),
+ TP_ARGS(inode, nr_pages),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(int, nr_pages)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->nr_pages = nr_pages;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->nr_pages)
+)
+
+#define DEFINE_READPAGE_EVENT(name) \
+DEFINE_EVENT(iomap_readpage_class, name, \
+ TP_PROTO(struct inode *inode, int nr_pages), \
+ TP_ARGS(inode, nr_pages))
+DEFINE_READPAGE_EVENT(iomap_readpage);
+DEFINE_READPAGE_EVENT(iomap_readpages);
+
+DECLARE_EVENT_CLASS(iomap_page_class,
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+ unsigned int len),
+ TP_ARGS(inode, page, off, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(pgoff_t, pgoff)
+ __field(loff_t, size)
+ __field(unsigned long, offset)
+ __field(unsigned int, length)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pgoff = page_offset(page);
+ __entry->size = i_size_read(inode);
+ __entry->offset = off;
+ __entry->length = len;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+ "length %x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pgoff,
+ __entry->size,
+ __entry->offset,
+ __entry->length)
+)
+
+#define DEFINE_PAGE_EVENT(name) \
+DEFINE_EVENT(iomap_page_class, name, \
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+ unsigned int len), \
+ TP_ARGS(inode, page, off, len))
+DEFINE_PAGE_EVENT(iomap_writepage);
+DEFINE_PAGE_EVENT(iomap_releasepage);
+DEFINE_PAGE_EVENT(iomap_invalidatepage);
+
+#define IOMAP_TYPE_STRINGS \
+ { IOMAP_HOLE, "HOLE" }, \
+ { IOMAP_DELALLOC, "DELALLOC" }, \
+ { IOMAP_MAPPED, "MAPPED" }, \
+ { IOMAP_UNWRITTEN, "UNWRITTEN" }, \
+ { IOMAP_INLINE, "INLINE" }
+
+#define IOMAP_FLAGS_STRINGS \
+ { IOMAP_WRITE, "WRITE" }, \
+ { IOMAP_ZERO, "ZERO" }, \
+ { IOMAP_REPORT, "REPORT" }, \
+ { IOMAP_FAULT, "FAULT" }, \
+ { IOMAP_DIRECT, "DIRECT" }, \
+ { IOMAP_NOWAIT, "NOWAIT" }
+
+#define IOMAP_F_FLAGS_STRINGS \
+ { IOMAP_F_NEW, "NEW" }, \
+ { IOMAP_F_DIRTY, "DIRTY" }, \
+ { IOMAP_F_SHARED, "SHARED" }, \
+ { IOMAP_F_MERGED, "MERGED" }, \
+ { IOMAP_F_BUFFER_HEAD, "BH" }, \
+ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }
+
+DECLARE_EVENT_CLASS(iomap_class,
+ TP_PROTO(struct inode *inode, struct iomap *iomap),
+ TP_ARGS(inode, iomap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(u64, addr)
+ __field(loff_t, offset)
+ __field(u64, length)
+ __field(u16, type)
+ __field(u16, flags)
+ __field(dev_t, bdev)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->addr = iomap->addr;
+ __entry->offset = iomap->offset;
+ __entry->length = iomap->length;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld "
+ "length %llu type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ MAJOR(__entry->bdev), MINOR(__entry->bdev),
+ __entry->addr,
+ __entry->offset,
+ __entry->length,
+ __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
+ __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+)
+
+#define DEFINE_IOMAP_EVENT(name) \
+DEFINE_EVENT(iomap_class, name, \
+ TP_PROTO(struct inode *inode, struct iomap *iomap), \
+ TP_ARGS(inode, iomap))
+DEFINE_IOMAP_EVENT(iomap_apply_dstmap);
+DEFINE_IOMAP_EVENT(iomap_apply_srcmap);
+
+TRACE_EVENT(iomap_apply,
+ TP_PROTO(struct inode *inode, loff_t pos, loff_t length,
+ unsigned int flags, const void *ops, void *actor,
+ unsigned long caller),
+ TP_ARGS(inode, pos, length, flags, ops, actor, caller),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(loff_t, pos)
+ __field(loff_t, length)
+ __field(unsigned int, flags)
+ __field(const void *, ops)
+ __field(void *, actor)
+ __field(unsigned long, caller)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pos = pos;
+ __entry->length = length;
+ __entry->flags = flags;
+ __entry->ops = ops;
+ __entry->actor = actor;
+ __entry->caller = caller;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) "
+ "ops %ps caller %pS actor %ps",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pos,
+ __entry->length,
+ __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
+ __entry->flags,
+ __entry->ops,
+ (void *)__entry->caller,
+ __entry->actor)
+);
+
+#endif /* _IOMAP_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
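
As a sketch of intended usage, the apply tracepoints are meant to fire around iomap_apply() along these lines (treat the exact call sites as illustrative of this series rather than definitive):

	/* on entry to iomap_apply() */
	trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_);

	/* after ->iomap_begin() has filled in the mappings */
	trace_iomap_apply_dstmap(inode, &iomap);
	if (srcmap.type != IOMAP_HOLE)
		trace_iomap_apply_srcmap(inode, &srcmap);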
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index a1909066bde6..8fff6677a5da 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -110,7 +110,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
- nblocks = jbd2_space_needed(journal);
+ nblocks = journal->j_max_transaction_buffers;
while (jbd2_log_space_left(journal) < nblocks) {
write_unlock(&journal->j_state_lock);
mutex_lock_io(&journal->j_checkpoint_mutex);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 132fb92098c7..7f0b362b3842 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -482,10 +482,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (jh->b_committed_data) {
struct buffer_head *bh = jh2bh(jh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
jbd2_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
jbd2_journal_refile_buffer(journal, jh);
}
@@ -560,8 +560,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
stats.run.rs_logging = jiffies;
stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
stats.run.rs_logging);
- stats.run.rs_blocks =
- atomic_read(&commit_transaction->t_outstanding_credits);
+ stats.run.rs_blocks = commit_transaction->t_nr_buffers;
stats.run.rs_blocks_logged = 0;
J_ASSERT(commit_transaction->t_nr_buffers <=
@@ -642,8 +641,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/*
* start_this_handle() uses t_outstanding_credits to determine
- * the free space in the log, but this counter is changed
- * by jbd2_journal_next_log_block() also.
+ * the free space in the log.
*/
atomic_dec(&commit_transaction->t_outstanding_credits);
@@ -727,7 +725,6 @@ start_journal_io:
submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
}
cond_resched();
- stats.run.rs_blocks_logged += bufs;
/* Force a new descriptor to be generated next
time round the loop. */
@@ -814,6 +811,7 @@ start_journal_io:
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
jbd2_unfile_log_bh(bh);
+ stats.run.rs_blocks_logged++;
/*
* The list contains temporary buffer heads created by
@@ -859,6 +857,7 @@ start_journal_io:
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
jbd2_unfile_log_bh(bh);
+ stats.run.rs_blocks_logged++;
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
@@ -880,6 +879,7 @@ start_journal_io:
}
if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
+ stats.run.rs_blocks_logged++;
if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
@@ -888,6 +888,9 @@ start_journal_io:
if (err)
jbd2_journal_abort(journal, err);
+ WARN_ON_ONCE(
+ atomic_read(&commit_transaction->t_outstanding_credits) < 0);
+
/*
* Now disk caches for filesystem device are flushed so we are safe to
* erase checkpointed transactions from the log by updating journal
@@ -918,6 +921,7 @@ restart_loop:
transaction_t *cp_transaction;
struct buffer_head *bh;
int try_to_free = 0;
+ bool drop_ref;
jh = commit_transaction->t_forget;
spin_unlock(&journal->j_list_lock);
@@ -927,7 +931,7 @@ restart_loop:
* done with it.
*/
get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
/*
@@ -1022,8 +1026,10 @@ restart_loop:
try_to_free = 1;
}
JBUFFER_TRACE(jh, "refile or unfile buffer");
- __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ drop_ref = __jbd2_journal_refile_buffer(jh);
+ spin_unlock(&jh->b_state_lock);
+ if (drop_ref)
+ jbd2_journal_put_journal_head(jh);
if (try_to_free)
release_buffer_page(bh); /* Drops bh reference */
else
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1c58859aa592..5e408ee24a1a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -363,7 +363,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
- jbd_lock_bh_state(bh_in);
+ spin_lock(&jh_in->b_state_lock);
repeat:
/*
* If a new transaction has already done a buffer copy-out, then
@@ -405,13 +405,13 @@ repeat:
if (need_copy_out && !done_copy_out) {
char *tmp;
- jbd_unlock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
brelse(new_bh);
return -ENOMEM;
}
- jbd_lock_bh_state(bh_in);
+ spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
goto repeat;
@@ -464,7 +464,7 @@ repeat:
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
set_buffer_shadow(bh_in);
- jbd_unlock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
return do_escape | (done_copy_out << 1);
}
@@ -840,6 +840,7 @@ jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh)
return NULL;
+ atomic_dec(&transaction->t_outstanding_credits);
lock_buffer(bh);
memset(bh->b_data, 0, journal->j_blocksize);
header = (journal_header_t *)bh->b_data;
@@ -1098,6 +1099,16 @@ static void jbd2_stats_proc_exit(journal_t *journal)
remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
+/* Minimum size of descriptor tag */
+static int jbd2_min_tag_size(void)
+{
+ /*
+ * A tag with 32-bit block numbers does not use the last four bytes of
+ * the structure.
+ */
+ return sizeof(journal_block_tag_t) - 4;
+}
+
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
@@ -1156,7 +1167,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
journal->j_maxlen = len;
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
+ /* We need enough buffers to write out a full descriptor block. */
+ n = journal->j_blocksize / jbd2_min_tag_size();
journal->j_wbufsize = n;
journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
GFP_KERNEL);
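
For a sense of scale, assuming the on-disk journal_block_tag_t is 12 bytes (as in jbd2.h), jbd2_min_tag_size() returns 8, so with 4K journal blocks:

	n = journal->j_blocksize / jbd2_min_tag_size();	/* 4096 / 8 = 512 */

which sizes j_wbuf to hold one full descriptor block's worth of buffers even in the densest tag format.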
@@ -1488,6 +1500,21 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
+static int journal_revoke_records_per_block(journal_t *journal)
+{
+ int record_size;
+ int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
+
+ if (jbd2_has_feature_64bit(journal))
+ record_size = 8;
+ else
+ record_size = 4;
+
+ if (jbd2_journal_has_csum_v2or3(journal))
+ space -= sizeof(struct jbd2_journal_block_tail);
+ return space / record_size;
+}
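
A worked example, assuming 4K blocks on a 64-bit, checksummed journal with a 16-byte jbd2_journal_revoke_header_t and a 4-byte block tail (struct sizes quoted from jbd2.h as an assumption):

	space = 4096 - 16 - 4;	/* = 4076 bytes left for records */
	space / 8;		/* = 509 revoke records per block */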
+
/*
* Read the superblock for a given journal, performing initial
* validation of the format.
@@ -1596,6 +1623,8 @@ static int journal_get_superblock(journal_t *journal)
sizeof(sb->s_uuid));
}
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
set_buffer_verified(bh);
return 0;
@@ -1916,6 +1945,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
unlock_buffer(journal->j_sb_buffer);
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
return 1;
#undef COMPAT_FEATURE_ON
@@ -1946,6 +1977,8 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
sb->s_feature_compat &= ~cpu_to_be32(compat);
sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
sb->s_feature_incompat &= ~cpu_to_be32(incompat);
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);
@@ -2410,6 +2443,8 @@ static struct journal_head *journal_alloc_journal_head(void)
ret = kmem_cache_zalloc(jbd2_journal_head_cache,
GFP_NOFS | __GFP_NOFAIL);
}
+ if (ret)
+ spin_lock_init(&ret->b_state_lock);
return ret;
}
@@ -2529,17 +2564,23 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
J_ASSERT_BH(bh, buffer_jbd(bh));
J_ASSERT_BH(bh, jh2bh(jh) == bh);
BUFFER_TRACE(bh, "remove journal_head");
+
+ /* Unlink before dropping the lock */
+ bh->b_private = NULL;
+ jh->b_bh = NULL; /* debug, really */
+ clear_buffer_jbd(bh);
+}
+
+static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
+{
if (jh->b_frozen_data) {
printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
- jbd2_free(jh->b_frozen_data, bh->b_size);
+ jbd2_free(jh->b_frozen_data, b_size);
}
if (jh->b_committed_data) {
printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
- jbd2_free(jh->b_committed_data, bh->b_size);
+ jbd2_free(jh->b_committed_data, b_size);
}
- bh->b_private = NULL;
- jh->b_bh = NULL; /* debug, really */
- clear_buffer_jbd(bh);
journal_free_journal_head(jh);
}
@@ -2557,9 +2598,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
if (!jh->b_jcount) {
__journal_remove_journal_head(bh);
jbd_unlock_bh_journal_head(bh);
+ journal_release_journal_head(jh, bh->b_size);
__brelse(bh);
- } else
+ } else {
jbd_unlock_bh_journal_head(bh);
+ }
}
/*
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f08073d7bbf5..fa608788b93d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -371,6 +371,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
}
#endif
+ if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) {
+ if (!bh_in)
+ brelse(bh);
+ return -EIO;
+ }
/* We really ought not ever to revoke twice in a row without
first having the revoke cancelled: it's illegal to free a
block twice without allocating it in between! */
@@ -391,6 +396,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
__brelse(bh);
}
}
+ handle->h_revoke_credits--;
jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bee8498d7792..27b9f9dee434 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -63,6 +63,28 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
}
/*
+ * Base amount of descriptor blocks we reserve for each transaction.
+ */
+static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+{
+ int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+ int tags_per_block;
+
+ /* Subtract UUID */
+ tag_space -= 16;
+ if (jbd2_journal_has_csum_v2or3(journal))
+ tag_space -= sizeof(struct jbd2_journal_block_tail);
+ /* Commit code leaves a slack space of 16 bytes at the end of the block */
+ tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+ /*
+ * Revoke descriptors are accounted separately, so we need to reserve
+ * space for the commit block and the normal transaction descriptor blocks.
+ */
+ return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
+ tags_per_block);
+}
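
Plugging in illustrative numbers (4K blocks, checksummed journal, 16-byte tags; the tag size is an assumption since journal_tag_bytes() varies with feature flags):

	tag_space = 4096 - 12 - 16 - 4;		/* header, UUID, block tail */
	tags_per_block = (tag_space - 16) / 16;	/* = 253 */
	/* j_max_transaction_buffers == 16384 -> 1 + DIV_ROUND_UP(16384, 253) = 66 */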
+
+/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
* Simply initialise a new transaction. Initialize it in
@@ -88,7 +110,9 @@ static void jbd2_get_transaction(journal_t *journal,
spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
+ jbd2_descriptor_blocks_per_trans(journal) +
atomic_read(&journal->j_reserved_credits));
+ atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
@@ -258,12 +282,13 @@ static int add_transaction_credits(journal_t *journal, int blocks,
* *before* starting to dirty potentially checkpointed buffers
* in the new transaction.
*/
- if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
- if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+ if (jbd2_log_space_left(journal) <
+ journal->j_max_transaction_buffers)
__jbd2_log_wait_for_space(journal);
write_unlock(&journal->j_state_lock);
return 1;
@@ -299,12 +324,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
- int blocks = handle->h_buffer_credits;
+ int blocks = handle->h_total_credits;
int rsv_blocks = 0;
unsigned long ts = jiffies;
if (handle->h_rsv_handle)
- rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+ rsv_blocks = handle->h_rsv_handle->h_total_credits;
/*
* Limit the number of reserved credits to 1/2 of maximum transaction
@@ -405,6 +430,7 @@ repeat:
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
handle->h_requested_credits = blocks;
+ handle->h_revoke_credits_requested = handle->h_revoke_credits;
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
@@ -431,15 +457,15 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
- handle->h_buffer_credits = nblocks;
+ handle->h_total_credits = nblocks;
handle->h_ref = 1;
return handle;
}
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
- gfp_t gfp_mask, unsigned int type,
- unsigned int line_no)
+ int revoke_records, gfp_t gfp_mask,
+ unsigned int type, unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -453,6 +479,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
return handle;
}
+ nblocks += DIV_ROUND_UP(revoke_records,
+ journal->j_revoke_records_per_block);
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
@@ -468,6 +496,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
rsv_handle->h_journal = journal;
handle->h_rsv_handle = rsv_handle;
}
+ handle->h_revoke_credits = revoke_records;
err = start_this_handle(journal, handle, gfp_mask);
if (err < 0) {
@@ -508,16 +537,21 @@ EXPORT_SYMBOL(jbd2__journal_start);
*/
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
+ return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
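
A hypothetical caller that expects to dirty one metadata block and revoke up to two blocks now reserves both kinds of credit up front:

	handle = jbd2__journal_start(journal, 1, 0, 2, GFP_NOFS, 0, 0);
	if (IS_ERR(handle))
		return PTR_ERR(handle);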
-void jbd2_journal_free_reserved(handle_t *handle)
+static void __jbd2_journal_unreserve_handle(handle_t *handle)
{
journal_t *journal = handle->h_journal;
WARN_ON(!handle->h_reserved);
- sub_reserved_credits(journal, handle->h_buffer_credits);
+ sub_reserved_credits(journal, handle->h_total_credits);
+}
+
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+ __jbd2_journal_unreserve_handle(handle);
jbd2_free_handle(handle);
}
EXPORT_SYMBOL(jbd2_journal_free_reserved);
@@ -571,7 +605,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
handle->h_line_no = line_no;
trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
handle->h_transaction->t_tid, type,
- line_no, handle->h_buffer_credits);
+ line_no, handle->h_total_credits);
return 0;
}
EXPORT_SYMBOL(jbd2_journal_start_reserved);
@@ -580,6 +614,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved);
* int jbd2_journal_extend() - extend buffer credits.
* @handle: handle to 'extend'
* @nblocks: nr blocks to try to extend by.
+ * @revoke_records: number of revoke records to try to extend by.
*
* Some transactions, such as large extends and truncates, can be done
* atomically all at once or in several stages. The operation requests
@@ -596,7 +631,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved);
* return code < 0 implies an error
* return code > 0 implies normal transaction-full status.
*/
-int jbd2_journal_extend(handle_t *handle, int nblocks)
+int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -618,6 +653,12 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
goto error_out;
}
+ nblocks += DIV_ROUND_UP(
+ handle->h_revoke_credits_requested + revoke_records,
+ journal->j_revoke_records_per_block) -
+ DIV_ROUND_UP(
+ handle->h_revoke_credits_requested,
+ journal->j_revoke_records_per_block);
spin_lock(&transaction->t_handle_lock);
wanted = atomic_add_return(nblocks,
&transaction->t_outstanding_credits);
@@ -629,22 +670,16 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
goto unlock;
}
- if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
- jbd2_log_space_left(journal)) {
- jbd_debug(3, "denied handle %p %d blocks: "
- "insufficient log space\n", handle, nblocks);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- goto unlock;
- }
-
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
transaction->t_tid,
handle->h_type, handle->h_line_no,
- handle->h_buffer_credits,
+ handle->h_total_credits,
nblocks);
- handle->h_buffer_credits += nblocks;
+ handle->h_total_credits += nblocks;
handle->h_requested_credits += nblocks;
+ handle->h_revoke_credits += revoke_records;
+ handle->h_revoke_credits_requested += revoke_records;
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -655,11 +690,55 @@ error_out:
return result;
}
+static void stop_this_handle(handle_t *handle)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ int revokes;
+
+ J_ASSERT(journal_current_handle() == handle);
+ J_ASSERT(atomic_read(&transaction->t_updates) > 0);
+ current->journal_info = NULL;
+ /*
+ * Subtract necessary revoke descriptor blocks from handle credits. We
+ * take care to account only for revoke descriptor blocks the
+ * transaction will really need as large sequences of transactions with
+ * small numbers of revokes are relatively common.
+ */
+ revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
+ if (revokes) {
+ int t_revokes, revoke_descriptors;
+ int rr_per_blk = journal->j_revoke_records_per_block;
+
+ WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
+ > handle->h_total_credits);
+ t_revokes = atomic_add_return(revokes,
+ &transaction->t_outstanding_revokes);
+ revoke_descriptors =
+ DIV_ROUND_UP(t_revokes, rr_per_blk) -
+ DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
+ handle->h_total_credits -= revoke_descriptors;
+ }
+ atomic_sub(handle->h_total_credits,
+ &transaction->t_outstanding_credits);
+ if (handle->h_rsv_handle)
+ __jbd2_journal_unreserve_handle(handle->h_rsv_handle);
+ if (atomic_dec_and_test(&transaction->t_updates))
+ wake_up(&journal->j_wait_updates);
+
+ rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+ /*
+ * Scope of the GFP_NOFS context is over here and so we can restore the
+ * original alloc context.
+ */
+ memalloc_nofs_restore(handle->saved_alloc_context);
+}
/**
* int jbd2_journal_restart() - restart a handle .
* @handle: handle to restart
* @nblocks: nr credits requested
+ * @revoke_records: number of revoke record credits requested
* @gfp_mask: memory allocation flags (for start_this_handle)
*
* Restart a handle for a multi-transaction filesystem
@@ -672,56 +751,48 @@ error_out:
* credits. We preserve reserved handle if there's any attached to the
* passed in handle.
*/
-int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
+int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
+ gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
tid_t tid;
- int need_to_start, ret;
+ int need_to_start;
+ int ret;
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
journal = transaction->t_journal;
+ tid = transaction->t_tid;
/*
* First unlink the handle from its current transaction, and start the
* commit on that.
*/
- J_ASSERT(atomic_read(&transaction->t_updates) > 0);
- J_ASSERT(journal_current_handle() == handle);
-
- read_lock(&journal->j_state_lock);
- spin_lock(&transaction->t_handle_lock);
- atomic_sub(handle->h_buffer_credits,
- &transaction->t_outstanding_credits);
- if (handle->h_rsv_handle) {
- sub_reserved_credits(journal,
- handle->h_rsv_handle->h_buffer_credits);
- }
- if (atomic_dec_and_test(&transaction->t_updates))
- wake_up(&journal->j_wait_updates);
- tid = transaction->t_tid;
- spin_unlock(&transaction->t_handle_lock);
+ jbd_debug(2, "restarting handle %p\n", handle);
+ stop_this_handle(handle);
handle->h_transaction = NULL;
- current->journal_info = NULL;
- jbd_debug(2, "restarting handle %p\n", handle);
+ /*
+ * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
+ * get rid of pointless j_state_lock traffic like this.
+ */
+ read_lock(&journal->j_state_lock);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
-
- rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
- handle->h_buffer_credits = nblocks;
- /*
- * Restore the original nofs context because the journal restart
- * is basically the same thing as journal stop and start.
- * start_this_handle will start a new nofs context.
- */
- memalloc_nofs_restore(handle->saved_alloc_context);
+ handle->h_total_credits = nblocks +
+ DIV_ROUND_UP(revoke_records,
+ journal->j_revoke_records_per_block);
+ handle->h_revoke_credits = revoke_records;
ret = start_this_handle(journal, handle, gfp_mask);
+ trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
+ ret ? 0 : handle->h_transaction->t_tid,
+ handle->h_type, handle->h_line_no,
+ handle->h_total_credits);
return ret;
}
EXPORT_SYMBOL(jbd2__journal_restart);
@@ -729,7 +800,7 @@ EXPORT_SYMBOL(jbd2__journal_restart);
int jbd2_journal_restart(handle_t *handle, int nblocks)
{
- return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+ return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
}
EXPORT_SYMBOL(jbd2_journal_restart);
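
The classic extend-or-restart pattern picks up the revoke argument the same way; a sketch with invented `needed`/`revokes` estimates and error handling elided:

	err = jbd2_journal_extend(handle, needed, revokes);
	if (err > 0)
		err = jbd2__journal_restart(handle, needed, revokes, GFP_NOFS);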
@@ -879,7 +950,7 @@ repeat:
start_lock = jiffies;
lock_buffer(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
/* If it takes too long to lock the buffer, trace it */
time_lock = jbd2_time_diff(start_lock, jiffies);
@@ -929,7 +1000,7 @@ repeat:
error = -EROFS;
if (is_handle_aborted(handle)) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
goto out;
}
error = 0;
@@ -993,7 +1064,7 @@ repeat:
*/
if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
goto repeat;
}
@@ -1014,7 +1085,7 @@ repeat:
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS | __GFP_NOFAIL);
goto repeat;
@@ -1033,7 +1104,7 @@ attach_next:
jh->b_next_transaction = transaction;
done:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* If we are about to journal a buffer, then any revoke pending on it is
@@ -1172,7 +1243,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
* that case: the transaction must have deleted the buffer for it to be
* reused here.
*/
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
jh->b_transaction == NULL ||
(jh->b_transaction == journal->j_committing_transaction &&
@@ -1207,7 +1278,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
jh->b_next_transaction = transaction;
spin_unlock(&journal->j_list_lock);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* akpm: I added this. ext3_alloc_branch can pick up new indirect
@@ -1275,13 +1346,13 @@ repeat:
committed_data = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS|__GFP_NOFAIL);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (!jh->b_committed_data) {
/* Copy out the current buffer contents into the
* preserved, committed copy. */
JBUFFER_TRACE(jh, "generate b_committed data");
if (!committed_data) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
goto repeat;
}
@@ -1289,7 +1360,7 @@ repeat:
committed_data = NULL;
memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
jbd2_journal_put_journal_head(jh);
if (unlikely(committed_data))
@@ -1390,16 +1461,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
*/
if (jh->b_transaction != transaction &&
jh->b_next_transaction != transaction) {
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
jh->b_next_transaction == transaction);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
if (jh->b_modified == 1) {
/* If it's in our transaction it must be in BJ_Metadata list. */
if (jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata) {
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata)
pr_err("JBD2: assertion failure: h_type=%u "
@@ -1409,13 +1480,13 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
jh->b_jlist);
J_ASSERT_JH(jh, jh->b_transaction != transaction ||
jh->b_jlist == BJ_Metadata);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
goto out;
}
journal = transaction->t_journal;
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (jh->b_modified == 0) {
/*
@@ -1423,12 +1494,12 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
* of the transaction. This needs to be done
* once a transaction -bzzz
*/
- if (handle->h_buffer_credits <= 0) {
+ if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
ret = -ENOSPC;
goto out_unlock_bh;
}
jh->b_modified = 1;
- handle->h_buffer_credits--;
+ handle->h_total_credits--;
}
/*
@@ -1501,7 +1572,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
JBUFFER_TRACE(jh, "exit");
return ret;
@@ -1539,18 +1610,20 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
BUFFER_TRACE(bh, "entry");
- jbd_lock_bh_state(bh);
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh) {
+ __bforget(bh);
+ return 0;
+ }
- if (!buffer_jbd(bh))
- goto not_jbd;
- jh = bh2jh(bh);
+ spin_lock(&jh->b_state_lock);
/* Critical error: attempting to delete a bitmap buffer, maybe?
* Don't do any jbd operations, and return an error. */
if (!J_EXPECT_JH(jh, !jh->b_committed_data,
"inconsistent data on disk")) {
err = -EIO;
- goto not_jbd;
+ goto drop;
}
/* keep track of whether or not this transaction modified us */
@@ -1598,10 +1671,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
} else {
__jbd2_journal_unfile_buffer(jh);
- if (!buffer_jbd(bh)) {
- spin_unlock(&journal->j_list_lock);
- goto not_jbd;
- }
+ jbd2_journal_put_journal_head(jh);
}
spin_unlock(&journal->j_list_lock);
} else if (jh->b_transaction) {
@@ -1643,7 +1713,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (!jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "belongs to none transaction");
spin_unlock(&journal->j_list_lock);
- goto not_jbd;
+ goto drop;
}
/*
@@ -1653,7 +1723,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (!buffer_dirty(bh)) {
__jbd2_journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
- goto not_jbd;
+ goto drop;
}
/*
@@ -1666,20 +1736,15 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
spin_unlock(&journal->j_list_lock);
}
-
- jbd_unlock_bh_state(bh);
- __brelse(bh);
drop:
+ __brelse(bh);
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
if (drop_reserve) {
/* no need to reserve log space for this block -bzzz */
- handle->h_buffer_credits++;
+ handle->h_total_credits++;
}
return err;
-
-not_jbd:
- jbd_unlock_bh_state(bh);
- __bforget(bh);
- goto drop;
}
/**
@@ -1706,45 +1771,34 @@ int jbd2_journal_stop(handle_t *handle)
tid_t tid;
pid_t pid;
+ if (--handle->h_ref > 0) {
+ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ handle->h_ref);
+ if (is_handle_aborted(handle))
+ return -EIO;
+ return 0;
+ }
if (!transaction) {
/*
- * Handle is already detached from the transaction so
- * there is nothing to do other than decrease a refcount,
- * or free the handle if refcount drops to zero
+ * Handle is already detached from the transaction so there is
+ * nothing to do other than free the handle.
*/
- if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
- handle->h_ref);
- return err;
- } else {
- if (handle->h_rsv_handle)
- jbd2_free_handle(handle->h_rsv_handle);
- goto free_and_exit;
- }
+ memalloc_nofs_restore(handle->saved_alloc_context);
+ goto free_and_exit;
}
journal = transaction->t_journal;
-
- J_ASSERT(journal_current_handle() == handle);
+ tid = transaction->t_tid;
if (is_handle_aborted(handle))
err = -EIO;
- else
- J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-
- if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
- handle->h_ref);
- return err;
- }
jbd_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
- transaction->t_tid,
- handle->h_type, handle->h_line_no,
+ tid, handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
handle->h_sync, handle->h_requested_credits,
(handle->h_requested_credits -
- handle->h_buffer_credits));
+ handle->h_total_credits));
/*
* Implement synchronous transaction batching. If the handle
@@ -1804,19 +1858,13 @@ int jbd2_journal_stop(handle_t *handle)
if (handle->h_sync)
transaction->t_synchronous_commit = 1;
- current->journal_info = NULL;
- atomic_sub(handle->h_buffer_credits,
- &transaction->t_outstanding_credits);
/*
* If the handle is marked SYNC, we need to set another commit
- * going! We also want to force a commit if the current
- * transaction is occupying too much of the log, or if the
- * transaction is too old now.
+ * going! We also want to force a commit if the transaction is too
+ * old now.
*/
if (handle->h_sync ||
- (atomic_read(&transaction->t_outstanding_credits) >
- journal->j_max_transaction_buffers) ||
time_after_eq(jiffies, transaction->t_expires)) {
/* Do this even for aborted journals: an abort still
* completes the commit thread, it just doesn't write
@@ -1825,7 +1873,7 @@ int jbd2_journal_stop(handle_t *handle)
jbd_debug(2, "transaction too old, requesting commit for "
"handle %p\n", handle);
/* This is non-blocking */
- jbd2_log_start_commit(journal, transaction->t_tid);
+ jbd2_log_start_commit(journal, tid);
/*
* Special case: JBD2_SYNC synchronous updates require us
@@ -1836,31 +1884,19 @@ int jbd2_journal_stop(handle_t *handle)
}
/*
- * Once we drop t_updates, if it goes to zero the transaction
- * could start committing on us and eventually disappear. So
- * once we do this, we must not dereference transaction
- * pointer again.
+ * Once stop_this_handle() drops t_updates, the transaction could start
+ * committing on us and eventually disappear. So we must not
+ * dereference transaction pointer again after calling
+ * stop_this_handle().
*/
- tid = transaction->t_tid;
- if (atomic_dec_and_test(&transaction->t_updates)) {
- wake_up(&journal->j_wait_updates);
- if (journal->j_barrier_count)
- wake_up(&journal->j_wait_transaction_locked);
- }
-
- rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
+ stop_this_handle(handle);
if (wait_for_commit)
err = jbd2_log_wait_commit(journal, tid);
- if (handle->h_rsv_handle)
- jbd2_journal_free_reserved(handle->h_rsv_handle);
free_and_exit:
- /*
- * Scope of the GFP_NOFS context is over here and so we can restore the
- * original alloc context.
- */
- memalloc_nofs_restore(handle->saved_alloc_context);
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
jbd2_free_handle(handle);
return err;
}
@@ -1878,7 +1914,7 @@ free_and_exit:
*
* j_list_lock is held.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1902,7 +1938,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
*
* Called with j_list_lock held, and the journal may not be locked.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1934,7 +1970,7 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
transaction_t *transaction;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
transaction = jh->b_transaction;
if (transaction)
assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1971,17 +2007,15 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
}
/*
- * Remove buffer from all transactions.
+ * Remove buffer from all transactions. The caller is responsible for dropping
+ * the jh reference that belonged to the transaction.
*
* Called with bh_state lock and j_list_lock
- *
- * jh and bh may be already freed when this function returns.
*/
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
__jbd2_journal_temp_unlink_buffer(jh);
jh->b_transaction = NULL;
- jbd2_journal_put_journal_head(jh);
}
void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
@@ -1990,18 +2024,19 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
/* Get reference so that buffer cannot be freed before we unlock it */
get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
__jbd2_journal_unfile_buffer(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
__brelse(bh);
}
/*
* Called from jbd2_journal_try_to_free_buffers().
*
- * Called under jbd_lock_bh_state(bh)
+ * Called under jh->b_state_lock
*/
static void
__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
@@ -2088,10 +2123,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (!jh)
continue;
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
__journal_try_to_free_buffer(journal, bh);
+ spin_unlock(&jh->b_state_lock);
jbd2_journal_put_journal_head(jh);
- jbd_unlock_bh_state(bh);
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
@@ -2112,7 +2147,7 @@ busy:
*
* Called under j_list_lock.
*
- * Called under jbd_lock_bh_state(bh).
+ * Called under jh->b_state_lock.
*/
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
@@ -2133,6 +2168,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
} else {
JBUFFER_TRACE(jh, "on running transaction");
__jbd2_journal_unfile_buffer(jh);
+ jbd2_journal_put_journal_head(jh);
}
return may_free;
}
@@ -2199,18 +2235,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
* holding the page lock. --sct
*/
- if (!buffer_jbd(bh))
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh)
goto zap_buffer_unlocked;
/* OK, we have data buffer in journaled mode */
write_lock(&journal->j_state_lock);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
- jh = jbd2_journal_grab_journal_head(bh);
- if (!jh)
- goto zap_buffer_no_jh;
-
/*
* We cannot remove the buffer from checkpoint lists until the
* transaction adding inode to orphan list (let's call it T)
@@ -2289,10 +2322,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
* for commit and try again.
*/
if (partial_page) {
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
return -EBUSY;
}
/*
@@ -2304,10 +2337,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
jh->b_next_transaction = journal->j_running_transaction;
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
return 0;
} else {
/* Good, the buffer belongs to the running transaction.
@@ -2331,11 +2364,10 @@ zap_buffer:
* here.
*/
jh->b_modified = 0;
- jbd2_journal_put_journal_head(jh);
-zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
zap_buffer_unlocked:
clear_buffer_dirty(bh);
J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2422,7 +2454,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
assert_spin_locked(&transaction->t_journal->j_list_lock);
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
@@ -2484,11 +2516,11 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
void jbd2_journal_file_buffer(struct journal_head *jh,
transaction_t *transaction, int jlist)
{
- jbd_lock_bh_state(jh2bh(jh));
+ spin_lock(&jh->b_state_lock);
spin_lock(&transaction->t_journal->j_list_lock);
__jbd2_journal_file_buffer(jh, transaction, jlist);
spin_unlock(&transaction->t_journal->j_list_lock);
- jbd_unlock_bh_state(jh2bh(jh));
+ spin_unlock(&jh->b_state_lock);
}
/*
@@ -2498,23 +2530,25 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
* buffer on that transaction's metadata list.
*
* Called under j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh))
+ * Called under jh->b_state_lock
*
- * jh and bh may be already free when this function returns
+ * When this function returns true, there's no next transaction to refile to
+ * and the caller has to drop the jh reference through
+ * jbd2_journal_put_journal_head().
*/
-void __jbd2_journal_refile_buffer(struct journal_head *jh)
+bool __jbd2_journal_refile_buffer(struct journal_head *jh)
{
int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
if (jh->b_transaction)
assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
/* If the buffer is now unused, just drop it. */
if (jh->b_next_transaction == NULL) {
__jbd2_journal_unfile_buffer(jh);
- return;
+ return true;
}
/*
@@ -2542,6 +2576,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
if (was_dirty)
set_buffer_jbddirty(bh);
+ return false;
}
/*
@@ -2552,16 +2587,15 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
*/
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
- struct buffer_head *bh = jh2bh(jh);
+ bool drop;
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
- __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ drop = __jbd2_journal_refile_buffer(jh);
+ spin_unlock(&jh->b_state_lock);
spin_unlock(&journal->j_list_lock);
- __brelse(bh);
+ if (drop)
+ jbd2_journal_put_journal_head(jh);
}
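Since b_state_lock is now embedded in the journal_head, jbd2_journal_put_journal_head() can free the very structure that holds the lock; that is why every call site above drops the reference only after unlocking. The contract for the refile path, restated as a sketch (cf. jbd2_journal_refile_buffer() above):

	bool drop;

	spin_lock(&jh->b_state_lock);
	spin_lock(&journal->j_list_lock);
	drop = __jbd2_journal_refile_buffer(jh);	/* true: no b_next_transaction */
	spin_unlock(&journal->j_list_lock);
	spin_unlock(&jh->b_state_lock);
	if (drop)
		jbd2_journal_put_journal_head(jh);	/* may free jh and its lock */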
/*
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index b2d9f79c4a7c..9d96e6871e1a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -438,7 +438,7 @@ void kernfs_put_active(struct kernfs_node *kn)
return;
if (kernfs_lockdep(kn))
- rwsem_release(&kn->dep_map, 1, _RET_IP_);
+ rwsem_release(&kn->dep_map, _RET_IP_);
v = atomic_dec_return(&kn->active);
if (likely(v != KN_DEACTIVATED_BIAS))
return;
@@ -476,7 +476,7 @@ static void kernfs_drain(struct kernfs_node *kn)
if (kernfs_lockdep(kn)) {
lock_acquired(&kn->dep_map, _RET_IP_);
- rwsem_release(&kn->dep_map, 1, _RET_IP_);
+ rwsem_release(&kn->dep_map, _RET_IP_);
}
kernfs_drain_open_files(kn);
diff --git a/fs/namei.c b/fs/namei.c
index 671c3c1a3425..2dda552bcf7a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -925,7 +925,7 @@ static inline int may_follow_link(struct nameidata *nd)
return -ECHILD;
audit_inode(nd->name, nd->stack[0].link.dentry, 0);
- audit_log_link_denied("follow_link");
+ audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
return -EACCES;
}
@@ -993,7 +993,7 @@ static int may_linkat(struct path *link)
if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
return 0;
- audit_log_link_denied("linkat");
+ audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
return -EPERM;
}
@@ -1031,6 +1031,10 @@ static int may_create_in_sticky(struct dentry * const dir,
(dir->d_inode->i_mode & 0020 &&
((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
(sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
+ const char *operation = S_ISFIFO(inode->i_mode) ?
+ "sticky_create_fifo" :
+ "sticky_create_regular";
+ audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
return -EACCES;
}
return 0;
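audit_log_path_denied() generalizes the old audit_log_link_denied() by taking the record type explicitly; the assumed signature is void audit_log_path_denied(int type, const char *operation). Usage as in the hunks above:

	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
	audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_fifo");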
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 91b9dac6b2cc..4ba73dbf3e8d 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1354,6 +1354,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case NILFS_IOCTL_SYNC:
case NILFS_IOCTL_RESIZE:
case NILFS_IOCTL_SET_ALLOC_RANGE:
+ case FITRIM:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8508ab575017..0aa362b88550 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -523,7 +523,7 @@ static const struct file_operations fanotify_fops = {
.fasync = NULL,
.release = fanotify_release,
.unlocked_ioctl = fanotify_ioctl,
- .compat_ioctl = fanotify_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.llseek = noop_llseek,
};
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 1e2bfd26b352..ef83f4020554 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -50,7 +50,7 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
f.handle.handle_bytes = sizeof(f.pad);
size = f.handle.handle_bytes >> 2;
- ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
+ ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, NULL);
if ((ret == FILEID_INVALID) || (ret < 0)) {
WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
return;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 2ecef6155fc0..3e77b728a22b 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -381,8 +381,6 @@ out:
}
EXPORT_SYMBOL_GPL(fsnotify);
-extern struct kmem_cache *fsnotify_mark_connector_cachep;
-
static __init int fsnotify_init(void)
{
int ret;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index f3462828a0e2..ff2063ec6b0f 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -65,4 +65,6 @@ extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
+extern struct kmem_cache *fsnotify_mark_connector_cachep;
+
#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 3e7da392aa6f..bb981ec76456 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -327,8 +327,8 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
down_read(&OCFS2_I(inode)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
up_read(&OCFS2_I(inode)->ip_xattr_sem);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
+ if (IS_ERR_OR_NULL(acl))
+ return PTR_ERR_OR_ZERO(acl);
ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
if (ret)
return ret;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f9baefc76cf9..88534eb0e7c2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2288,9 +2288,9 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
int ret = 0;
int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
- if (handle->h_buffer_credits < credits)
+ if (jbd2_handle_buffer_credits(handle) < credits)
ret = ocfs2_extend_trans(handle,
- credits - handle->h_buffer_credits);
+ credits - jbd2_handle_buffer_credits(handle));
return ret;
}
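handle->h_buffer_credits is no longer read directly: the jbd2 credits rework (also visible in the fs/ocfs2/journal.c hunks below, where jbd2_journal_extend() gains an extra revoke-credits argument) hides the counter behind an accessor. The conversion pattern throughout these hunks, as a sketch:

	int avail = jbd2_handle_buffer_credits(handle);	/* replaces handle->h_buffer_credits */

	if (avail < credits)
		ret = ocfs2_extend_trans(handle, credits - avail);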
@@ -2367,7 +2367,7 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
- int ret, start, orig_credits = handle->h_buffer_credits;
+ int ret, start, orig_credits = jbd2_handle_buffer_credits(handle);
u32 cpos;
struct ocfs2_path *left_path = NULL;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
@@ -3148,7 +3148,7 @@ static int ocfs2_rotate_tree_left(handle_t *handle,
struct ocfs2_path *path,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
- int ret, orig_credits = handle->h_buffer_credits;
+ int ret, orig_credits = jbd2_handle_buffer_credits(handle);
struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
@@ -3386,8 +3386,8 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
- handle->h_buffer_credits,
- right_path);
+ jbd2_handle_buffer_credits(handle),
+ right_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3548,8 +3548,8 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
- handle->h_buffer_credits,
- left_path);
+ jbd2_handle_buffer_credits(handle),
+ left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3623,7 +3623,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
le16_to_cpu(el->l_next_free_rec) == 1) {
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
right_path);
if (ret) {
mlog_errno(ret);
@@ -3669,7 +3669,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -3725,7 +3725,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -3755,7 +3755,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -3799,7 +3799,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
if (ctxt->c_split_covers_rec) {
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -5358,7 +5358,7 @@ static int ocfs2_truncate_rec(handle_t *handle,
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -5427,8 +5427,8 @@ static int ocfs2_truncate_rec(handle_t *handle,
}
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
+ jbd2_handle_buffer_credits(handle),
+ path);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9cd0a6815933..3a67a6518ddf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -11,7 +11,6 @@
#include <linux/pagemap.h>
#include <asm/byteorder.h>
#include <linux/swap.h>
-#include <linux/pipe_fs_i.h>
#include <linux/mpage.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e774c5ea13b..1c4c51f3df60 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1687,7 +1687,7 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
spin_unlock_irqrestore(&lockres->l_lock, flags);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (lockres->l_lockdep_map.key != NULL)
- rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
+ rwsem_release(&lockres->l_lockdep_map, caller_ip);
#endif
}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index efeea208fdeb..89984172fc4a 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -985,6 +985,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 1);
+ case FITRIM:
case OCFS2_IOC_MOVE_EXT:
break;
default:
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 699a560efbb0..1afe57f425a0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -420,14 +420,14 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
if (!nblocks)
return 0;
- old_nblocks = handle->h_buffer_credits;
+ old_nblocks = jbd2_handle_buffer_credits(handle);
trace_ocfs2_extend_trans(old_nblocks, nblocks);
#ifdef CONFIG_OCFS2_DEBUG_FS
status = 1;
#else
- status = jbd2_journal_extend(handle, nblocks);
+ status = jbd2_journal_extend(handle, nblocks, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -461,13 +461,13 @@ int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
BUG_ON(!handle);
- old_nblks = handle->h_buffer_credits;
+ old_nblks = jbd2_handle_buffer_credits(handle);
trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
if (old_nblks < thresh)
return 0;
- status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
+ status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 7a922190a8c7..eda83487c9ec 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -728,7 +728,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
- if (atomic_read(&dquot->dq_count) > 1)
+ if (dquot_is_busy(dquot))
goto out;
/* Running from downconvert thread? Postpone quota processing to wq */
if (current == osb->dc_task) {
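dquot_is_busy() folds the old dq_count check together with a dirtiness test, so a dquot with a pending write is treated as busy too; presumably something like this (a sketch, not the exact include/linux/quotaops.h code):

	static inline bool dquot_is_busy(struct dquot *dquot)
	{
		if (test_bit(DQ_MOD_B, &dquot->dq_flags))	/* dirty, write pending */
			return true;
		if (atomic_read(&dquot->dq_count) > 1)		/* racing dqget() */
			return true;
		return false;
	}

The same conversion appears in fs/quota/dquot.c further down.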
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 69c21a3843af..4180c3ef0a68 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1252,6 +1252,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr)
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ struct journal_head *jh;
int ret;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
@@ -1260,13 +1261,14 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
if (!buffer_jbd(bg_bh))
return 1;
- jbd_lock_bh_state(bg_bh);
- bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+ jh = bh2jh(bg_bh);
+ spin_lock(&jh->b_state_lock);
+ bg = (struct ocfs2_group_desc *) jh->b_committed_data;
if (bg)
ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
else
ret = 1;
- jbd_unlock_bh_state(bg_bh);
+ spin_unlock(&jh->b_state_lock);
return ret;
}
@@ -2387,6 +2389,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
int status;
unsigned int tmp;
struct ocfs2_group_desc *undo_bg = NULL;
+ struct journal_head *jh;
/* The caller got this descriptor from
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
@@ -2405,10 +2408,10 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
goto bail;
}
+ jh = bh2jh(group_bh);
if (undo_fn) {
- jbd_lock_bh_state(group_bh);
- undo_bg = (struct ocfs2_group_desc *)
- bh2jh(group_bh)->b_committed_data;
+ spin_lock(&jh->b_state_lock);
+ undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
BUG_ON(!undo_bg);
}
@@ -2423,7 +2426,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
if (undo_fn)
- jbd_unlock_bh_state(group_bh);
+ spin_unlock(&jh->b_state_lock);
return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
(unsigned long long)le64_to_cpu(bg->bg_blkno),
le16_to_cpu(bg->bg_bits),
@@ -2432,7 +2435,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
if (undo_fn)
- jbd_unlock_bh_state(group_bh);
+ spin_unlock(&jh->b_state_lock);
ocfs2_journal_dirty(handle, group_bh);
bail:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c81e86c62380..05dd68ade293 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -926,8 +926,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
status = -ENOENT;
goto out_quota_off;
}
- status = dquot_enable(inode[type], type, QFMT_OCFS2,
- DQUOT_USAGE_ENABLED);
+ status = dquot_load_quota_inode(inode[type], type, QFMT_OCFS2,
+ DQUOT_USAGE_ENABLED);
if (status < 0)
goto out_quota_off;
}
diff --git a/fs/open.c b/fs/open.c
index 5c68282ea79e..b62f5c0923a8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -771,6 +771,10 @@ static int do_dentry_open(struct file *f,
f->f_mode |= FMODE_WRITER;
}
+ /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
+ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
+ f->f_mode |= FMODE_ATOMIC_POS;
+
f->f_op = fops_get(inode->i_fop);
if (WARN_ON(!f->f_op)) {
error = -ENODEV;
@@ -1252,7 +1256,7 @@ EXPORT_SYMBOL(nonseekable_open);
*/
int stream_open(struct inode *inode, struct file *filp)
{
- filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+ filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
filp->f_mode |= FMODE_STREAM;
return 0;
}
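With do_dentry_open() now granting FMODE_ATOMIC_POS to every regular file and directory, stream_open() has to clear it explicitly so stream-like files keep skipping f_pos serialization. The read/write paths only take the position lock when the flag survives; roughly (a sketch of the fdget_pos logic, not the exact fs/file.c code):

	if ((file->f_mode & FMODE_ATOMIC_POS) && file_count(file) > 1)
		mutex_lock(&file->f_pos_lock);	/* serialize f_pos updates */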
diff --git a/fs/pipe.c b/fs/pipe.c
index a9149199e0e7..648ce440ca85 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -43,10 +43,12 @@ unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
/*
- * We use a start+len construction, which provides full use of the
- * allocated memory.
- * -- Florian Coosmann (FGC)
- *
+ * We use head and tail indices that aren't masked off, except at the point
+ * of dereference; instead they're allowed to wrap naturally. This means there
+ * isn't a dead spot in the buffer, but the ring has to be a power of two and
+ * <= 2^31.
+ * -- David Howells 2019-09-23.
+ *
* Reads with count = 0 should always return 0.
* -- Julian Bradfield 1999-06-07.
*
@@ -285,10 +287,12 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
ret = 0;
__pipe_lock(pipe);
for (;;) {
- int bufs = pipe->nrbufs;
- if (bufs) {
- int curbuf = pipe->curbuf;
- struct pipe_buffer *buf = pipe->bufs + curbuf;
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
+
+ if (!pipe_empty(head, tail)) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t chars = buf->len;
size_t written;
int error;
@@ -320,18 +324,27 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
}
if (!buf->len) {
+ bool wake;
pipe_buf_release(pipe, buf);
- curbuf = (curbuf + 1) & (pipe->buffers - 1);
- pipe->curbuf = curbuf;
- pipe->nrbufs = --bufs;
+ spin_lock_irq(&pipe->wait.lock);
+ tail++;
+ pipe->tail = tail;
do_wakeup = 1;
+ wake = head - (tail - 1) == pipe->max_usage / 2;
+ if (wake)
+ wake_up_locked_poll(
+ &pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ spin_unlock_irq(&pipe->wait.lock);
+ if (wake)
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
+ if (!pipe_empty(head, tail)) /* More to do? */
+ continue;
}
- if (bufs) /* More to do? */
- continue;
+
if (!pipe->writers)
break;
if (!pipe->waiting_writers) {
@@ -352,17 +365,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
ret = -ERESTARTSYS;
break;
}
- if (do_wakeup) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
- }
pipe_wait(pipe);
}
__pipe_unlock(pipe);
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ wake_up_interruptible_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (ret > 0)
@@ -380,6 +389,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
+ unsigned int head, max_usage, mask;
ssize_t ret = 0;
int do_wakeup = 0;
size_t total_len = iov_iter_count(from);
@@ -397,12 +407,14 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
+ head = pipe->head;
+ max_usage = pipe->max_usage;
+ mask = pipe->ring_size - 1;
+
/* We try to merge small writes */
chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
- if (pipe->nrbufs && chars != 0) {
- int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
- (pipe->buffers - 1);
- struct pipe_buffer *buf = pipe->bufs + lastbuf;
+ if (!pipe_empty(head, pipe->tail) && chars != 0) {
+ struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;
if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
@@ -423,18 +435,16 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
}
for (;;) {
- int bufs;
-
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
- bufs = pipe->nrbufs;
- if (bufs < pipe->buffers) {
- int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
- struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+ head = pipe->head;
+ if (!pipe_full(head, pipe->tail, max_usage)) {
+ struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;
@@ -446,38 +456,64 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
}
pipe->tmp_page = page;
}
+
+ /* Allocate a slot in the ring in advance and attach an
+ * empty buffer. If we fault or otherwise fail to use
+ * it, either the reader will consume it or it'll still
+ * be there for the next write.
+ */
+ spin_lock_irq(&pipe->wait.lock);
+
+ head = pipe->head;
+ if (pipe_full(head, pipe->tail, max_usage)) {
+ spin_unlock_irq(&pipe->wait.lock);
+ continue;
+ }
+
+ pipe->head = head + 1;
+
/* Always wake up, even if the copy fails. Otherwise
* we lock up (O_NONBLOCK-)readers that sleep due to
* syscall merging.
* FIXME! Is this really true?
*/
- do_wakeup = 1;
- copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
- if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
- if (!ret)
- ret = -EFAULT;
- break;
- }
- ret += copied;
+ wake_up_locked_poll(
+ &pipe->wait, EPOLLIN | EPOLLRDNORM);
+
+ spin_unlock_irq(&pipe->wait.lock);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
/* Insert it into the buffer array */
+ buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
- buf->len = copied;
+ buf->len = 0;
buf->flags = 0;
if (is_packetized(filp)) {
buf->ops = &packet_pipe_buf_ops;
buf->flags = PIPE_BUF_FLAG_PACKET;
}
- pipe->nrbufs = ++bufs;
pipe->tmp_page = NULL;
+ copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
+ if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
+ if (!ret)
+ ret = -EFAULT;
+ break;
+ }
+ ret += copied;
+ buf->offset = 0;
+ buf->len = copied;
+
if (!iov_iter_count(from))
break;
}
- if (bufs < pipe->buffers)
+
+ if (!pipe_full(head, pipe->tail, max_usage))
continue;
+
+ /* Wait for buffer space to become available. */
if (filp->f_flags & O_NONBLOCK) {
if (!ret)
ret = -EAGAIN;
@@ -488,11 +524,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
ret = -ERESTARTSYS;
break;
}
- if (do_wakeup) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- do_wakeup = 0;
- }
pipe->waiting_writers++;
pipe_wait(pipe);
pipe->waiting_writers--;
@@ -500,7 +531,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
out:
__pipe_unlock(pipe);
if (do_wakeup) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
+ wake_up_interruptible_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
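Publishing head + 1 and waking readers before the page copy trades a slightly early EPOLLIN for lower wakeup latency; it is safe because readers still serialize on the pipe mutex, and a buffer left at len == 0 after a faulted copy is either consumed harmlessly or reused by the next write. The ordering, in sketch form:

	spin_lock_irq(&pipe->wait.lock);
	pipe->head = head + 1;				/* slot published, len still 0 */
	wake_up_locked_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
	spin_unlock_irq(&pipe->wait.lock);

	copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
	buf->len = copied;				/* data now visible to readers */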
@@ -515,17 +546,19 @@ out:
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe = filp->private_data;
- int count, buf, nrbufs;
+ int count, head, tail, mask;
switch (cmd) {
case FIONREAD:
__pipe_lock(pipe);
count = 0;
- buf = pipe->curbuf;
- nrbufs = pipe->nrbufs;
- while (--nrbufs >= 0) {
- count += pipe->bufs[buf].len;
- buf = (buf+1) & (pipe->buffers - 1);
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
+
+ while (tail != head) {
+ count += pipe->bufs[tail & mask].len;
+ tail++;
}
__pipe_unlock(pipe);
@@ -541,21 +574,25 @@ pipe_poll(struct file *filp, poll_table *wait)
{
__poll_t mask;
struct pipe_inode_info *pipe = filp->private_data;
- int nrbufs;
+ unsigned int head = READ_ONCE(pipe->head);
+ unsigned int tail = READ_ONCE(pipe->tail);
poll_wait(filp, &pipe->wait, wait);
+ BUG_ON(pipe_occupancy(head, tail) > pipe->ring_size);
+
/* Reading only -- no need for acquiring the semaphore. */
- nrbufs = pipe->nrbufs;
mask = 0;
if (filp->f_mode & FMODE_READ) {
- mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
+ if (!pipe_empty(head, tail))
+ mask |= EPOLLIN | EPOLLRDNORM;
if (!pipe->writers && filp->f_version != pipe->w_counter)
mask |= EPOLLHUP;
}
if (filp->f_mode & FMODE_WRITE) {
- mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0;
+ if (!pipe_full(head, tail, pipe->max_usage))
+ mask |= EPOLLOUT | EPOLLWRNORM;
/*
* Most Unices do not set EPOLLERR for FIFOs but on Linux they
* behave exactly like pipes for poll().
@@ -679,7 +716,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
if (pipe->bufs) {
init_waitqueue_head(&pipe->wait);
pipe->r_counter = pipe->w_counter = 1;
- pipe->buffers = pipe_bufs;
+ pipe->max_usage = pipe_bufs;
+ pipe->ring_size = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
@@ -697,9 +735,9 @@ void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
- (void) account_pipe_buffers(pipe->user, pipe->buffers, 0);
+ (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
free_uid(pipe->user);
- for (i = 0; i < pipe->buffers; i++) {
+ for (i = 0; i < pipe->ring_size; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
pipe_buf_release(pipe, buf);
@@ -882,7 +920,7 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
- int cur = *cnt;
+ int cur = *cnt;
while (cur == *cnt) {
pipe_wait(pipe);
@@ -957,7 +995,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
}
}
break;
-
+
case FMODE_WRITE:
/*
* O_WRONLY
@@ -977,7 +1015,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
goto err_wr;
}
break;
-
+
case FMODE_READ | FMODE_WRITE:
/*
* O_RDWR
@@ -1056,14 +1094,14 @@ unsigned int round_pipe_size(unsigned long size)
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
struct pipe_buffer *bufs;
- unsigned int size, nr_pages;
+ unsigned int size, nr_slots, head, tail, mask, n;
unsigned long user_bufs;
long ret = 0;
size = round_pipe_size(arg);
- nr_pages = size >> PAGE_SHIFT;
+ nr_slots = size >> PAGE_SHIFT;
- if (!nr_pages)
+ if (!nr_slots)
return -EINVAL;
/*
@@ -1073,13 +1111,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
* Decreasing the pipe capacity is always permitted, even
* if the user is currently over a limit.
*/
- if (nr_pages > pipe->buffers &&
+ if (nr_slots > pipe->ring_size &&
size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
return -EPERM;
- user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
+ user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);
- if (nr_pages > pipe->buffers &&
+ if (nr_slots > pipe->ring_size &&
(too_many_pipe_buffers_hard(user_bufs) ||
too_many_pipe_buffers_soft(user_bufs)) &&
is_unprivileged_user()) {
@@ -1088,17 +1126,21 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
}
/*
- * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
- * expect a lot of shrink+grow operations, just free and allocate
- * again like we would do for growing. If the pipe currently
+ * We can shrink the pipe if arg is at least the ring occupancy.
+ * Since we don't expect a lot of shrink+grow operations, just free and
+ * allocate again like we would do for growing. If the pipe currently
* contains more buffers than arg, then return busy.
*/
- if (nr_pages < pipe->nrbufs) {
+ mask = pipe->ring_size - 1;
+ head = pipe->head;
+ tail = pipe->tail;
+ n = pipe_occupancy(pipe->head, pipe->tail);
+ if (nr_slots < n) {
ret = -EBUSY;
goto out_revert_acct;
}
- bufs = kcalloc(nr_pages, sizeof(*bufs),
+ bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs)) {
ret = -ENOMEM;
@@ -1107,33 +1149,37 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
/*
* The pipe array wraps around, so just start the new one at zero
- * and adjust the indexes.
+ * and adjust the indices.
*/
- if (pipe->nrbufs) {
- unsigned int tail;
- unsigned int head;
-
- tail = pipe->curbuf + pipe->nrbufs;
- if (tail < pipe->buffers)
- tail = 0;
- else
- tail &= (pipe->buffers - 1);
-
- head = pipe->nrbufs - tail;
- if (head)
- memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
- if (tail)
- memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
+ if (n > 0) {
+ unsigned int h = head & mask;
+ unsigned int t = tail & mask;
+ if (h > t) {
+ memcpy(bufs, pipe->bufs + t,
+ n * sizeof(struct pipe_buffer));
+ } else {
+ unsigned int tsize = pipe->ring_size - t;
+ if (h > 0)
+ memcpy(bufs + tsize, pipe->bufs,
+ h * sizeof(struct pipe_buffer));
+ memcpy(bufs, pipe->bufs + t,
+ tsize * sizeof(struct pipe_buffer));
+ }
}
- pipe->curbuf = 0;
+ head = n;
+ tail = 0;
+
kfree(pipe->bufs);
pipe->bufs = bufs;
- pipe->buffers = nr_pages;
- return nr_pages * PAGE_SIZE;
+ pipe->ring_size = nr_slots;
+ pipe->max_usage = nr_slots;
+ pipe->tail = tail;
+ pipe->head = head;
+ return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
- (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
+ (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
return ret;
}
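A worked example of the unwrap in pipe_set_size(): with ring_size = 8, tail = 6 and head = 10 (n = 4), the masked indices are t = 6 and h = 2, so the h <= t branch runs:

	/* tsize = 8 - 6 = 2:  bufs[0..1] = old slots 6..7
	 * h = 2 (> 0):        bufs[2..3] = old slots 0..1
	 * afterwards tail = 0, head = n = 4: same occupancy, now linear
	 */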
@@ -1163,7 +1209,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
ret = pipe_set_size(pipe, arg);
break;
case F_GETPIPE_SZ:
- ret = pipe->buffers * PAGE_SIZE;
+ ret = pipe->max_usage * PAGE_SIZE;
break;
default:
ret = -EINVAL;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 80c305f206bb..37bdbec5b402 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -120,20 +120,23 @@ static int show_stat(struct seq_file *p, void *v)
getboottime64(&boottime);
for_each_possible_cpu(i) {
- struct kernel_cpustat *kcs = &kcpustat_cpu(i);
-
- user += kcs->cpustat[CPUTIME_USER];
- nice += kcs->cpustat[CPUTIME_NICE];
- system += kcs->cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(kcs, i);
- iowait += get_iowait_time(kcs, i);
- irq += kcs->cpustat[CPUTIME_IRQ];
- softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
- steal += kcs->cpustat[CPUTIME_STEAL];
- guest += kcs->cpustat[CPUTIME_GUEST];
- guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
- sum += kstat_cpu_irqs_sum(i);
- sum += arch_irq_stat_cpu(i);
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+
+ user += cpustat[CPUTIME_USER];
+ nice += cpustat[CPUTIME_NICE];
+ system += cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(&kcpustat, i);
+ iowait += get_iowait_time(&kcpustat, i);
+ irq += cpustat[CPUTIME_IRQ];
+ softirq += cpustat[CPUTIME_SOFTIRQ];
+ steal += cpustat[CPUTIME_STEAL];
+ guest += cpustat[CPUTIME_GUEST];
+		guest_nice += cpustat[CPUTIME_GUEST_NICE];
+ sum += kstat_cpu_irqs_sum(i);
+ sum += arch_irq_stat_cpu(i);
for (j = 0; j < NR_SOFTIRQS; j++) {
unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -157,19 +160,22 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
- struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcs->cpustat[CPUTIME_USER];
- nice = kcs->cpustat[CPUTIME_NICE];
- system = kcs->cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(kcs, i);
- iowait = get_iowait_time(kcs, i);
- irq = kcs->cpustat[CPUTIME_IRQ];
- softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
- steal = kcs->cpustat[CPUTIME_STEAL];
- guest = kcs->cpustat[CPUTIME_GUEST];
- guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
+ user = cpustat[CPUTIME_USER];
+ nice = cpustat[CPUTIME_NICE];
+ system = cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(&kcpustat, i);
+ iowait = get_iowait_time(&kcpustat, i);
+ irq = cpustat[CPUTIME_IRQ];
+ softirq = cpustat[CPUTIME_SOFTIRQ];
+ steal = cpustat[CPUTIME_STEAL];
+ guest = cpustat[CPUTIME_GUEST];
+		guest_nice = cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
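kcpustat_cpu_fetch() takes a coherent snapshot of the per-cpu cputime array, folding in vtime still accruing on nohz_full CPUs, instead of reading the kcpustat_cpu(i) fields live. A minimal usage sketch:

	struct kernel_cpustat snap;
	u64 user = 0, guest = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		kcpustat_cpu_fetch(&snap, cpu);		/* coherent, vtime-aware copy */
		user  += snap.cpustat[CPUTIME_USER];
		guest += snap.cpustat[CPUTIME_GUEST];
	}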
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 6e826b454082..4639d53e96a3 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -497,7 +497,7 @@ int dquot_release(struct dquot *dquot)
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
- if (atomic_read(&dquot->dq_count) > 1)
+ if (dquot_is_busy(dquot))
goto out_dqlock;
if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
@@ -595,7 +595,6 @@ int dquot_scan_active(struct super_block *sb,
/* Now we have active dquot so we can just increase use count */
atomic_inc(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- dqstats_inc(DQST_LOOKUPS);
dqput(old_dquot);
old_dquot = dquot;
/*
@@ -623,7 +622,7 @@ EXPORT_SYMBOL(dquot_scan_active);
/* Write all dquot structures to quota files */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
- struct list_head *dirty;
+ struct list_head dirty;
struct dquot *dquot;
struct quota_info *dqopt = sb_dqopt(sb);
int cnt;
@@ -637,9 +636,10 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
if (!sb_has_quota_active(sb, cnt))
continue;
spin_lock(&dq_list_lock);
- dirty = &dqopt->info[cnt].dqi_dirty_list;
- while (!list_empty(dirty)) {
- dquot = list_first_entry(dirty, struct dquot,
+ /* Move list away to avoid livelock. */
+ list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty);
+ while (!list_empty(&dirty)) {
+ dquot = list_first_entry(&dirty, struct dquot,
dq_dirty);
WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
@@ -649,7 +649,6 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
* use count */
dqgrab(dquot);
spin_unlock(&dq_list_lock);
- dqstats_inc(DQST_LOOKUPS);
err = sb->dq_op->write_dquot(dquot);
if (err) {
/*
@@ -2162,14 +2161,29 @@ int dquot_file_open(struct inode *inode, struct file *file)
}
EXPORT_SYMBOL(dquot_file_open);
+static void vfs_cleanup_quota_inode(struct super_block *sb, int type)
+{
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct inode *inode = dqopt->files[type];
+
+ if (!inode)
+ return;
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+ inode_lock(inode);
+ inode->i_flags &= ~S_NOQUOTA;
+ inode_unlock(inode);
+ }
+ dqopt->files[type] = NULL;
+ iput(inode);
+}
+
/*
* Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
*/
int dquot_disable(struct super_block *sb, int type, unsigned int flags)
{
- int cnt, ret = 0;
+ int cnt;
struct quota_info *dqopt = sb_dqopt(sb);
- struct inode *toputinode[MAXQUOTAS];
/* s_umount should be held in exclusive mode */
if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
@@ -2191,7 +2205,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
return 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- toputinode[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
if (!sb_has_quota_loaded(sb, cnt))
@@ -2211,8 +2224,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
dqopt->flags &= ~dquot_state_flag(
DQUOT_SUSPENDED, cnt);
spin_unlock(&dq_state_lock);
- iput(dqopt->files[cnt]);
- dqopt->files[cnt] = NULL;
+ vfs_cleanup_quota_inode(sb, cnt);
continue;
}
spin_unlock(&dq_state_lock);
@@ -2234,10 +2246,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
if (dqopt->ops[cnt]->free_file_info)
dqopt->ops[cnt]->free_file_info(sb, cnt);
put_quota_format(dqopt->info[cnt].dqi_format);
-
- toputinode[cnt] = dqopt->files[cnt];
- if (!sb_has_quota_loaded(sb, cnt))
- dqopt->files[cnt] = NULL;
dqopt->info[cnt].dqi_flags = 0;
dqopt->info[cnt].dqi_igrace = 0;
dqopt->info[cnt].dqi_bgrace = 0;
@@ -2259,32 +2267,22 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
* must also discard the blockdev buffers so that we see the
* changes done by userspace on the next quotaon() */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- /* This can happen when suspending quotas on remount-ro... */
- if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) {
- inode_lock(toputinode[cnt]);
- toputinode[cnt]->i_flags &= ~S_NOQUOTA;
- truncate_inode_pages(&toputinode[cnt]->i_data, 0);
- inode_unlock(toputinode[cnt]);
- mark_inode_dirty_sync(toputinode[cnt]);
+ if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) {
+ inode_lock(dqopt->files[cnt]);
+ truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
+ inode_unlock(dqopt->files[cnt]);
}
if (sb->s_bdev)
invalidate_bdev(sb->s_bdev);
put_inodes:
+ /* We are done when suspending quotas */
+ if (flags & DQUOT_SUSPENDED)
+ return 0;
+
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- if (toputinode[cnt]) {
- /* On remount RO, we keep the inode pointer so that we
- * can reenable quota on the subsequent remount RW. We
- * have to check 'flags' variable and not use sb_has_
- * function because another quotaon / quotaoff could
- * change global state before we got here. We refuse
- * to suspend quotas when there is pending delete on
- * the quota file... */
- if (!(flags & DQUOT_SUSPENDED))
- iput(toputinode[cnt]);
- else if (!toputinode[cnt]->i_nlink)
- ret = -EBUSY;
- }
- return ret;
+ if (!sb_has_quota_loaded(sb, cnt))
+ vfs_cleanup_quota_inode(sb, cnt);
+ return 0;
}
EXPORT_SYMBOL(dquot_disable);
@@ -2299,28 +2297,52 @@ EXPORT_SYMBOL(dquot_quota_off);
* Turn quotas on on a device
*/
-/*
- * Helper function to turn quotas on when we already have the inode of
- * quota file and no quota information is loaded.
- */
-static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+static int vfs_setup_quota_inode(struct inode *inode, int type)
+{
+ struct super_block *sb = inode->i_sb;
+ struct quota_info *dqopt = sb_dqopt(sb);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EACCES;
+ if (IS_RDONLY(inode))
+ return -EROFS;
+ if (sb_has_quota_loaded(sb, type))
+ return -EBUSY;
+
+ dqopt->files[type] = igrab(inode);
+ if (!dqopt->files[type])
+ return -EIO;
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+		/* We don't want quota and atime on quota files (deadlocks are
+		 * possible). Also nobody should write to the file - we use
+		 * special IO operations which ignore the immutable bit. */
+ inode_lock(inode);
+ inode->i_flags |= S_NOQUOTA;
+ inode_unlock(inode);
+ /*
+ * When S_NOQUOTA is set, remove dquot references as no more
+ * references can be added
+ */
+ __dquot_drop(inode);
+ }
+ return 0;
+}
+
+int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
unsigned int flags)
{
struct quota_format_type *fmt = find_quota_format(format_id);
- struct super_block *sb = inode->i_sb;
struct quota_info *dqopt = sb_dqopt(sb);
int error;
+ /* Just unsuspend quotas? */
+ BUG_ON(flags & DQUOT_SUSPENDED);
+ /* s_umount should be held in exclusive mode */
+ if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
+ up_read(&sb->s_umount);
+
if (!fmt)
return -ESRCH;
- if (!S_ISREG(inode->i_mode)) {
- error = -EACCES;
- goto out_fmt;
- }
- if (IS_RDONLY(inode)) {
- error = -EROFS;
- goto out_fmt;
- }
if (!sb->s_op->quota_write || !sb->s_op->quota_read ||
(type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
error = -EINVAL;
@@ -2352,27 +2374,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
invalidate_bdev(sb->s_bdev);
}
- if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
- /* We don't want quota and atime on quota files (deadlocks
- * possible) Also nobody should write to the file - we use
- * special IO operations which ignore the immutable bit. */
- inode_lock(inode);
- inode->i_flags |= S_NOQUOTA;
- inode_unlock(inode);
- /*
- * When S_NOQUOTA is set, remove dquot references as no more
- * references can be added
- */
- __dquot_drop(inode);
- }
-
- error = -EIO;
- dqopt->files[type] = igrab(inode);
- if (!dqopt->files[type])
- goto out_file_flags;
error = -EINVAL;
if (!fmt->qf_ops->check_quota_file(sb, type))
- goto out_file_init;
+ goto out_fmt;
dqopt->ops[type] = fmt->qf_ops;
dqopt->info[type].dqi_format = fmt;
@@ -2380,7 +2384,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
error = dqopt->ops[type]->read_file_info(sb, type);
if (error < 0)
- goto out_file_init;
+ goto out_fmt;
if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) {
spin_lock(&dq_data_lock);
dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
@@ -2395,24 +2399,36 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
dquot_disable(sb, type, flags);
return error;
-out_file_init:
- dqopt->files[type] = NULL;
- iput(inode);
-out_file_flags:
- inode_lock(inode);
- inode->i_flags &= ~S_NOQUOTA;
- inode_unlock(inode);
out_fmt:
put_quota_format(fmt);
return error;
}
+EXPORT_SYMBOL(dquot_load_quota_sb);
+
+/*
+ * More powerful function for turning on quotas on a given quota inode,
+ * allowing individual quota flags to be set
+ */
+int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
+ unsigned int flags)
+{
+ int err;
+
+ err = vfs_setup_quota_inode(inode, type);
+ if (err < 0)
+ return err;
+ err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags);
+ if (err < 0)
+ vfs_cleanup_quota_inode(inode->i_sb, type);
+ return err;
+}
+EXPORT_SYMBOL(dquot_load_quota_inode);
/* Reenable quotas on remount RW */
int dquot_resume(struct super_block *sb, int type)
{
struct quota_info *dqopt = sb_dqopt(sb);
- struct inode *inode;
int ret = 0, cnt;
unsigned int flags;
@@ -2426,8 +2442,6 @@ int dquot_resume(struct super_block *sb, int type)
if (!sb_has_quota_suspended(sb, cnt))
continue;
- inode = dqopt->files[cnt];
- dqopt->files[cnt] = NULL;
spin_lock(&dq_state_lock);
flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
DQUOT_LIMITS_ENABLED,
@@ -2436,9 +2450,10 @@ int dquot_resume(struct super_block *sb, int type)
spin_unlock(&dq_state_lock);
flags = dquot_generic_flag(flags, cnt);
- ret = vfs_load_quota_inode(inode, cnt,
- dqopt->info[cnt].dqi_fmt_id, flags);
- iput(inode);
+ ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id,
+ flags);
+ if (ret < 0)
+			vfs_cleanup_quota_inode(sb, cnt);
}
return ret;
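The split between inode setup and quota loading is what lets dquot_resume() above call dquot_load_quota_sb() directly: the quota inode stays attached across suspend, so only the loading half needs to be redone. Path-based quotaon keeps both halves via the combined helper, as in the ocfs2 hunk earlier; in sketch form:

	/* classic path-based enable: sets up the inode, then loads */
	err = dquot_load_quota_inode(inode, type, format_id,
				     DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);

	/* resume: inode already set up, just reload */
	err = dquot_load_quota_sb(sb, type, format_id, flags);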
@@ -2455,7 +2470,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id,
if (path->dentry->d_sb != sb)
error = -EXDEV;
else
- error = vfs_load_quota_inode(d_inode(path->dentry), type,
+ error = dquot_load_quota_inode(d_inode(path->dentry), type,
format_id, DQUOT_USAGE_ENABLED |
DQUOT_LIMITS_ENABLED);
return error;
@@ -2463,41 +2478,6 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id,
EXPORT_SYMBOL(dquot_quota_on);
/*
- * More powerful function for turning on quotas allowing setting
- * of individual quota flags
- */
-int dquot_enable(struct inode *inode, int type, int format_id,
- unsigned int flags)
-{
- struct super_block *sb = inode->i_sb;
-
- /* Just unsuspend quotas? */
- BUG_ON(flags & DQUOT_SUSPENDED);
- /* s_umount should be held in exclusive mode */
- if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
- up_read(&sb->s_umount);
-
- if (!flags)
- return 0;
- /* Just updating flags needed? */
- if (sb_has_quota_loaded(sb, type)) {
- if (flags & DQUOT_USAGE_ENABLED &&
- sb_has_quota_usage_enabled(sb, type))
- return -EBUSY;
- if (flags & DQUOT_LIMITS_ENABLED &&
- sb_has_quota_limits_enabled(sb, type))
- return -EBUSY;
- spin_lock(&dq_state_lock);
- sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
- spin_unlock(&dq_state_lock);
- return 0;
- }
-
- return vfs_load_quota_inode(inode, type, format_id, flags);
-}
-EXPORT_SYMBOL(dquot_enable);
-
-/*
* This function is used when filesystem needs to initialize quotas
* during mount time.
*/
@@ -2518,7 +2498,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
error = security_quota_on(dentry);
if (!error)
- error = vfs_load_quota_inode(d_inode(dentry), type, format_id,
+ error = dquot_load_quota_inode(d_inode(dentry), type, format_id,
DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
out:
@@ -2543,13 +2523,17 @@ static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
if (!(flags & qtype_enforce_flag(type)))
continue;
/* Can't enforce without accounting */
- if (!sb_has_quota_usage_enabled(sb, type))
- return -EINVAL;
- ret = dquot_enable(dqopt->files[type], type,
- dqopt->info[type].dqi_fmt_id,
- DQUOT_LIMITS_ENABLED);
- if (ret < 0)
+ if (!sb_has_quota_usage_enabled(sb, type)) {
+ ret = -EINVAL;
goto out_err;
+ }
+ if (sb_has_quota_limits_enabled(sb, type)) {
+ ret = -EBUSY;
+ goto out_err;
+ }
+ spin_lock(&dq_state_lock);
+ dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
+ spin_unlock(&dq_state_lock);
}
return 0;
out_err:
@@ -2599,10 +2583,12 @@ static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
out_err:
/* Backout enforcement disabling we already did */
for (type--; type >= 0; type--) {
- if (flags & qtype_enforce_flag(type))
- dquot_enable(dqopt->files[type], type,
- dqopt->info[type].dqi_fmt_id,
- DQUOT_LIMITS_ENABLED);
+ if (flags & qtype_enforce_flag(type)) {
+ spin_lock(&dq_state_lock);
+ dqopt->flags |=
+ dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
+ spin_unlock(&dq_state_lock);
+ }
}
return ret;
}
@@ -2800,8 +2786,10 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state)
tstate->flags |= QCI_LIMITS_ENFORCED;
tstate->spc_timelimit = mi->dqi_bgrace;
tstate->ino_timelimit = mi->dqi_igrace;
- tstate->ino = dqopt->files[type]->i_ino;
- tstate->blocks = dqopt->files[type]->i_blocks;
+ if (dqopt->files[type]) {
+ tstate->ino = dqopt->files[type]->i_ino;
+ tstate->blocks = dqopt->files[type]->i_blocks;
+ }
tstate->nextents = 1; /* We don't know... */
spin_unlock(&dq_data_lock);
}
@@ -2860,68 +2848,73 @@ EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
static int do_proc_dqstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- unsigned int type = (int *)table->data - dqstats.stat;
+ unsigned int type = (unsigned long *)table->data - dqstats.stat;
+ s64 value = percpu_counter_sum(&dqstats.counter[type]);
+
+ /* Filter negative values for non-monotonic counters */
+ if (value < 0 && (type == DQST_ALLOC_DQUOTS ||
+ type == DQST_FREE_DQUOTS))
+ value = 0;
/* Update global table */
- dqstats.stat[type] =
- percpu_counter_sum_positive(&dqstats.counter[type]);
- return proc_dointvec(table, write, buffer, lenp, ppos);
+ dqstats.stat[type] = value;
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
static struct ctl_table fs_dqstats_table[] = {
{
.procname = "lookups",
.data = &dqstats.stat[DQST_LOOKUPS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "drops",
.data = &dqstats.stat[DQST_DROPS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "reads",
.data = &dqstats.stat[DQST_READS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "writes",
.data = &dqstats.stat[DQST_WRITES],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "cache_hits",
.data = &dqstats.stat[DQST_CACHE_HITS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "allocated_dquots",
.data = &dqstats.stat[DQST_ALLOC_DQUOTS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "free_dquots",
.data = &dqstats.stat[DQST_FREE_DQUOTS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "syncs",
.data = &dqstats.stat[DQST_SYNCS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
@@ -2983,11 +2976,7 @@ static int __init dquot_init(void)
/* Find power-of-two hlist_heads which can fit into allocation */
nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
- dq_hash_bits = 0;
- do {
- dq_hash_bits++;
- } while (nr_hash >> dq_hash_bits);
- dq_hash_bits--;
+ dq_hash_bits = ilog2(nr_hash);
nr_hash = 1UL << dq_hash_bits;
dq_hash_mask = nr_hash - 1;
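ilog2(nr_hash) computes floor(log2(nr_hash)), exactly what the removed loop did one shift at a time. For instance, with order = 2, 4KiB pages and 8-byte hlist_heads:

	/* nr_hash = (1UL << 2) * 4096 / 8 = 2048
	 * dq_hash_bits = ilog2(2048) = 11
	 * final table: 1UL << 11 = 2048 buckets, dq_hash_mask = 2047
	 */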
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index cb13fb76dbee..5444d3c4d93f 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -60,8 +60,6 @@ static int quota_sync_all(int type)
{
int ret;
- if (type >= MAXQUOTAS)
- return -EINVAL;
ret = security_quotactl(Q_SYNC, type, 0, NULL);
if (!ret)
iterate_supers(quota_sync_one, &type);
@@ -686,8 +684,6 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
{
int ret;
- if (type >= MAXQUOTAS)
- return -EINVAL;
type = array_index_nospec(type, MAXQUOTAS);
/*
* Quota not supported on this fs? Check this before s_quota_types
@@ -831,6 +827,9 @@ int kernel_quotactl(unsigned int cmd, const char __user *special,
cmds = cmd >> SUBCMDSHIFT;
type = cmd & SUBCMDMASK;
+ if (type >= MAXQUOTAS)
+ return -EINVAL;
+
/*
* As a special case Q_SYNC can be called without a specific device.
* It will iterate all superblocks that have quota enabled and call
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index c740e5572eb8..cd92e5fa0062 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -217,7 +217,6 @@ static const struct quota_format_ops v1_format_ops = {
.check_quota_file = v1_check_quota_file,
.read_file_info = v1_read_file_info,
.write_file_info = v1_write_file_info,
- .free_file_info = NULL,
.read_dqblk = v1_read_dqblk,
.commit_dqblk = v1_commit_dqblk,
};
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 843aadcc123c..84cf8bdbec9c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -38,16 +38,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
BUG_ON(!S_ISREG(inode->i_mode));
- if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
+ if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers,
+ &REISERFS_I(inode)->tailpack))
return 0;
- mutex_lock(&REISERFS_I(inode)->tailpack);
-
- if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
- mutex_unlock(&REISERFS_I(inode)->tailpack);
- return 0;
- }
-
/* fast out for when nothing needs to be done */
if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
!tail_has_to_be_packed(inode)) &&
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 132ec4406ed0..6419e6dacc39 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2097,6 +2097,15 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
goto out_inserted_sd;
}
+ /*
+ * Mark it private if we're creating the privroot
+ * or something under it.
+ */
+ if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) {
+ inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
+
if (reiserfs_posixacl(inode->i_sb)) {
reiserfs_write_unlock(inode->i_sb);
retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
@@ -2111,8 +2120,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
reiserfs_warning(inode->i_sb, "jdm-13090",
"ACLs aren't enabled in the fs, "
"but vfs thinks they are!");
- } else if (IS_PRIVATE(dir))
- inode->i_flags |= S_PRIVATE;
+ }
if (security->name) {
reiserfs_write_unlock(inode->i_sb);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 97f3fc4fdd79..959a066b7bb0 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -377,10 +377,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
/*
* Propagate the private flag so we know we're
- * in the priv tree
+ * in the priv tree. Also clear IOP_XATTR
+ * since we don't have xattrs on xattr files.
*/
- if (IS_PRIVATE(dir))
+ if (IS_PRIVATE(dir)) {
inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
}
reiserfs_write_unlock(dir->i_sb);
if (retval == IO_ERROR) {
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index e5ca9ed79e54..726580114d55 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1168,6 +1168,8 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
return bmap_nr > ((1LL << 16) - 1);
}
+extern const struct xattr_handler *reiserfs_xattr_handlers[];
+
/*
* this says about version of key of all items (but stat data) the
* object consists of
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index d69b4ac0ae2f..3244037b1286 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2049,6 +2049,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (replay_only(s))
goto error_unlocked;
+ s->s_xattr = reiserfs_xattr_handlers;
+
if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
SWARN(silent, s, "clm-7000",
"Detected readonly device, marking FS readonly");
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index b5b26d8a192c..62b40df36c98 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -122,13 +122,13 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
struct dentry *xaroot;
if (d_really_is_negative(privroot))
- return ERR_PTR(-ENODATA);
+ return ERR_PTR(-EOPNOTSUPP);
inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
xaroot = dget(REISERFS_SB(sb)->xattr_root);
if (!xaroot)
- xaroot = ERR_PTR(-ENODATA);
+ xaroot = ERR_PTR(-EOPNOTSUPP);
else if (d_really_is_negative(xaroot)) {
int err = -ENODATA;
@@ -619,6 +619,10 @@ int reiserfs_xattr_set(struct inode *inode, const char *name,
int error, error2;
size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
+ /* Check before we start a transaction and then do nothing. */
+ if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root))
+ return -EOPNOTSUPP;
+
if (!(flags & XATTR_REPLACE))
jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
@@ -841,8 +845,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
if (d_really_is_negative(dentry))
return -EINVAL;
- if (!dentry->d_sb->s_xattr ||
- get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
+ if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
@@ -882,6 +885,7 @@ static int create_privroot(struct dentry *dentry)
}
d_inode(dentry)->i_flags |= S_PRIVATE;
+ d_inode(dentry)->i_opflags &= ~IOP_XATTR;
reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
"storage.\n", PRIVROOT_NAME);
@@ -895,7 +899,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
#endif
/* Actual operations that are exported to VFS-land */
-static const struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler *reiserfs_xattr_handlers[] = {
#ifdef CONFIG_REISERFS_FS_XATTR
&reiserfs_xattr_user_handler,
&reiserfs_xattr_trusted_handler,
@@ -966,8 +970,10 @@ int reiserfs_lookup_privroot(struct super_block *s)
if (!IS_ERR(dentry)) {
REISERFS_SB(s)->priv_root = dentry;
d_set_d_op(dentry, &xattr_lookup_poison_ops);
- if (d_really_is_positive(dentry))
+ if (d_really_is_positive(dentry)) {
d_inode(dentry)->i_flags |= S_PRIVATE;
+ d_inode(dentry)->i_opflags &= ~IOP_XATTR;
+ }
} else
err = PTR_ERR(dentry);
inode_unlock(d_inode(s->s_root));
@@ -996,7 +1002,6 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
}
if (d_really_is_positive(privroot)) {
- s->s_xattr = reiserfs_xattr_handlers;
inode_lock(d_inode(privroot));
if (!REISERFS_SB(s)->xattr_root) {
struct dentry *dentry;
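With s_xattr now set unconditionally in reiserfs_fill_super(), reiserfs_xattr_set() has to reject requests itself when no privroot exists, and it does so before reserving any journal space. A sketch of that fail-fast guard, assuming a stubbed transaction API (begin_transaction/end_transaction are hypothetical stand-ins for journal_begin/journal_end):

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Stand-in for the mount-time state; in the kernel this is
	 * REISERFS_SB(sb)->priv_root being a positive dentry. */
	static bool priv_root_exists;

	static void begin_transaction(void) { puts("journal_begin"); }
	static int  end_transaction(void)   { puts("journal_end"); return 0; }

	/* Mirrors the hunk in reiserfs_xattr_set(): fail fast with
	 * -EOPNOTSUPP before reserving journal space we would never use. */
	static int xattr_set(const char *name)
	{
		if (!priv_root_exists)
			return -EOPNOTSUPP;

		begin_transaction();
		printf("set %s\n", name);
		return end_transaction();
	}

	int main(void)
	{
		printf("no privroot: %d\n", xattr_set("user.test"));
		priv_root_exists = true;
		printf("privroot:    %d\n", xattr_set("user.test"));
		return 0;
	}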
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index aa9380bac196..05f666794561 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -320,10 +320,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
* would be useless since permissions are ignored, and a pain because
* it introduces locking cycles
*/
- if (IS_PRIVATE(dir)) {
- inode->i_flags |= S_PRIVATE;
+ if (IS_PRIVATE(inode))
goto apply_umask;
- }
err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (err)
diff --git a/fs/select.c b/fs/select.c
index 53a0c149f528..11d0285d46b7 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -321,7 +321,7 @@ static int poll_select_finish(struct timespec64 *end_time,
switch (pt_type) {
case PT_TIMEVAL:
{
- struct timeval rtv;
+ struct __kernel_old_timeval rtv;
if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
memset(&rtv, 0, sizeof(rtv));
@@ -698,10 +698,10 @@ out_nofds:
}
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timeval __user *tvp)
+ fd_set __user *exp, struct __kernel_old_timeval __user *tvp)
{
struct timespec64 end_time, *to = NULL;
- struct timeval tv;
+ struct __kernel_old_timeval tv;
int ret;
if (tvp) {
@@ -720,7 +720,7 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
}
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
- fd_set __user *, exp, struct timeval __user *, tvp)
+ fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp)
{
return kern_select(n, inp, outp, exp, tvp);
}
@@ -810,7 +810,7 @@ SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *,
struct sel_arg_struct {
unsigned long n;
fd_set __user *inp, *outp, *exp;
- struct timeval __user *tvp;
+ struct __kernel_old_timeval __user *tvp;
};
SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
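struct __kernel_old_timeval has the same layout as the old struct timeval; the rename frees the timeval name as part of the y2038 work while these syscalls keep their legacy ABI. A sketch of the conversion into a nanosecond-based representation, as poll_select_finish() does via timespec64 (the struct definition here is a local stand-in, and on 32-bit targets the long tv_sec is exactly the field that overflows in 2038):

	#include <stdio.h>
	#include <time.h>

	/* Same layout as the kernel's struct __kernel_old_timeval; the
	 * rename is purely to free up the name "timeval" in the UAPI. */
	struct __kernel_old_timeval {
		long tv_sec;
		long tv_usec;
	};

	/* Convert to a y2038-safe, nanosecond-based representation. */
	static struct timespec old_timeval_to_timespec(struct __kernel_old_timeval tv)
	{
		struct timespec ts = {
			.tv_sec  = tv.tv_sec,
			.tv_nsec = tv.tv_usec * 1000L,
		};
		return ts;
	}

	int main(void)
	{
		struct __kernel_old_timeval tv = { .tv_sec = 1, .tv_usec = 500000 };
		struct timespec ts = old_timeval_to_timespec(tv);

		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}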
diff --git a/fs/splice.c b/fs/splice.c
index 98412721f056..f2400ce7d528 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd)
{
unsigned int spd_pages = spd->nr_pages;
+ unsigned int tail = pipe->tail;
+ unsigned int head = pipe->head;
+ unsigned int mask = pipe->ring_size - 1;
int ret = 0, page_nr = 0;
if (!spd_pages)
@@ -196,9 +199,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
goto out;
}
- while (pipe->nrbufs < pipe->buffers) {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
- struct pipe_buffer *buf = pipe->bufs + newbuf;
+ while (!pipe_full(head, tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = &pipe->bufs[head & mask];
buf->page = spd->pages[page_nr];
buf->offset = spd->partial[page_nr].offset;
@@ -207,7 +209,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
buf->ops = spd->ops;
buf->flags = 0;
- pipe->nrbufs++;
+ head++;
+ pipe->head = head;
page_nr++;
ret += buf->len;
@@ -228,17 +231,19 @@ EXPORT_SYMBOL_GPL(splice_to_pipe);
ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
int ret;
if (unlikely(!pipe->readers)) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
- } else if (pipe->nrbufs == pipe->buffers) {
+ } else if (pipe_full(head, tail, pipe->max_usage)) {
ret = -EAGAIN;
} else {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
- pipe->bufs[newbuf] = *buf;
- pipe->nrbufs++;
+ pipe->bufs[head & mask] = *buf;
+ pipe->head = head + 1;
return buf->len;
}
pipe_buf_release(pipe, buf);
@@ -252,14 +257,14 @@ EXPORT_SYMBOL(add_to_pipe);
*/
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
- unsigned int buffers = READ_ONCE(pipe->buffers);
+ unsigned int max_usage = READ_ONCE(pipe->max_usage);
- spd->nr_pages_max = buffers;
- if (buffers <= PIPE_DEF_BUFFERS)
+ spd->nr_pages_max = max_usage;
+ if (max_usage <= PIPE_DEF_BUFFERS)
return 0;
- spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL);
- spd->partial = kmalloc_array(buffers, sizeof(struct partial_page),
+ spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
+ spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
GFP_KERNEL);
if (spd->pages && spd->partial)
@@ -298,10 +303,11 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
{
struct iov_iter to;
struct kiocb kiocb;
- int idx, ret;
+ unsigned int i_head;
+ int ret;
iov_iter_pipe(&to, READ, pipe, len);
- idx = to.idx;
+ i_head = to.head;
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -309,7 +315,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
*ppos = kiocb.ki_pos;
file_accessed(in);
} else if (ret < 0) {
- to.idx = idx;
+ to.head = i_head;
to.iov_offset = 0;
iov_iter_advance(&to, 0); /* to free what was emitted */
/*
@@ -370,11 +376,12 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
struct iov_iter to;
struct page **pages;
unsigned int nr_pages;
+ unsigned int mask;
size_t offset, base, copied = 0;
ssize_t res;
int i;
- if (pipe->nrbufs == pipe->buffers)
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
return -EAGAIN;
/*
@@ -400,8 +407,9 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
}
}
- pipe->bufs[to.idx].offset = offset;
- pipe->bufs[to.idx].len -= offset;
+ mask = pipe->ring_size - 1;
+ pipe->bufs[to.head & mask].offset = offset;
+ pipe->bufs[to.head & mask].len -= offset;
for (i = 0; i < nr_pages; i++) {
size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
@@ -443,7 +451,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
- if (sd->len < sd->total_len && pipe->nrbufs > 1)
+ if (sd->len < sd->total_len &&
+ pipe_occupancy(pipe->head, pipe->tail) > 1)
more |= MSG_SENDPAGE_NOTLAST;
return file->f_op->sendpage(file, buf->page, buf->offset,
@@ -481,10 +490,13 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_actor *actor)
{
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
int ret;
- while (pipe->nrbufs) {
- struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ while (!pipe_empty(tail, head)) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
sd->len = buf->len;
if (sd->len > sd->total_len)
@@ -511,8 +523,8 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
if (!buf->len) {
pipe_buf_release(pipe, buf);
- pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
- pipe->nrbufs--;
+ tail++;
+ pipe->tail = tail;
if (pipe->files)
sd->need_wakeup = true;
}
@@ -543,7 +555,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
if (signal_pending(current))
return -ERESTARTSYS;
- while (!pipe->nrbufs) {
+ while (pipe_empty(pipe->head, pipe->tail)) {
if (!pipe->writers)
return 0;
@@ -686,7 +698,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
.pos = *ppos,
.u.file = out,
};
- int nbufs = pipe->buffers;
+ int nbufs = pipe->max_usage;
struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
GFP_KERNEL);
ssize_t ret;
@@ -699,16 +711,19 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
splice_from_pipe_begin(&sd);
while (sd.total_len) {
struct iov_iter from;
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
size_t left;
- int n, idx;
+ int n;
ret = splice_from_pipe_next(pipe, &sd);
if (ret <= 0)
break;
- if (unlikely(nbufs < pipe->buffers)) {
+ if (unlikely(nbufs < pipe->max_usage)) {
kfree(array);
- nbufs = pipe->buffers;
+ nbufs = pipe->max_usage;
array = kcalloc(nbufs, sizeof(struct bio_vec),
GFP_KERNEL);
if (!array) {
@@ -719,16 +734,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
/* build the vector */
left = sd.total_len;
- for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
- struct pipe_buffer *buf = pipe->bufs + idx;
+ for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t this_len = buf->len;
if (this_len > left)
this_len = left;
- if (idx == pipe->buffers - 1)
- idx = -1;
-
ret = pipe_buf_confirm(pipe, buf);
if (unlikely(ret)) {
if (ret == -ENODATA)
@@ -752,14 +764,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
*ppos = sd.pos;
/* dismiss the fully eaten buffers, adjust the partial one */
+ tail = pipe->tail;
while (ret) {
- struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
if (ret >= buf->len) {
ret -= buf->len;
buf->len = 0;
pipe_buf_release(pipe, buf);
- pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
- pipe->nrbufs--;
+ tail++;
+ pipe->tail = tail;
if (pipe->files)
sd.need_wakeup = true;
} else {
@@ -942,15 +955,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
sd->flags &= ~SPLICE_F_NONBLOCK;
more = sd->flags & SPLICE_F_MORE;
- WARN_ON_ONCE(pipe->nrbufs != 0);
+ WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
while (len) {
+ unsigned int p_space;
size_t read_len;
loff_t pos = sd->pos, prev_pos = pos;
/* Don't try to read more than the pipe has space for. */
- read_len = min_t(size_t, len,
- (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT);
+ p_space = pipe->max_usage -
+ pipe_occupancy(pipe->head, pipe->tail);
+ read_len = min_t(size_t, len, p_space << PAGE_SHIFT);
ret = do_splice_to(in, &pos, pipe, read_len, flags);
if (unlikely(ret <= 0))
goto out_release;
@@ -989,7 +1004,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
}
done:
- pipe->nrbufs = pipe->curbuf = 0;
+ pipe->tail = pipe->head = 0;
file_accessed(in);
return bytes;
@@ -998,8 +1013,8 @@ out_release:
* If we did an incomplete transfer we must release
* the pipe buffers in question:
*/
- for (i = 0; i < pipe->buffers; i++) {
- struct pipe_buffer *buf = pipe->bufs + i;
+ for (i = 0; i < pipe->ring_size; i++) {
+ struct pipe_buffer *buf = &pipe->bufs[i];
if (buf->ops)
pipe_buf_release(pipe, buf);
@@ -1075,7 +1090,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
send_sig(SIGPIPE, current, 0);
return -EPIPE;
}
- if (pipe->nrbufs != pipe->buffers)
+ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
return 0;
if (flags & SPLICE_F_NONBLOCK)
return -EAGAIN;
@@ -1180,8 +1195,15 @@ static long do_splice(struct file *in, loff_t __user *off_in,
pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
- if (!ret)
+ if (!ret) {
+ unsigned int p_space;
+
+			/* Don't try to read more than the pipe has space for. */
+ p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);
+ len = min_t(size_t, len, p_space << PAGE_SHIFT);
+
ret = do_splice_to(in, &offset, opipe, len, flags);
+ }
pipe_unlock(opipe);
if (ret > 0)
wakeup_pipe_readers(opipe);
@@ -1442,16 +1464,16 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
int ret;
/*
- * Check ->nrbufs without the inode lock first. This function
+ * Check the pipe occupancy without the inode lock first. This function
 * is speculative anyway, so missing one is ok.
*/
- if (pipe->nrbufs)
+ if (!pipe_empty(pipe->head, pipe->tail))
return 0;
ret = 0;
pipe_lock(pipe);
- while (!pipe->nrbufs) {
+ while (pipe_empty(pipe->head, pipe->tail)) {
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
@@ -1480,16 +1502,16 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
int ret;
/*
- * Check ->nrbufs without the inode lock first. This function
+	 * Check the pipe occupancy without the inode lock first. This function
 * is speculative anyway, so missing one is ok.
*/
- if (pipe->nrbufs < pipe->buffers)
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
return 0;
ret = 0;
pipe_lock(pipe);
- while (pipe->nrbufs >= pipe->buffers) {
+ while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
@@ -1520,7 +1542,10 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
- int ret = 0, nbuf;
+ unsigned int i_head, o_head;
+ unsigned int i_tail, o_tail;
+ unsigned int i_mask, o_mask;
+ int ret = 0;
bool input_wakeup = false;
@@ -1540,7 +1565,14 @@ retry:
*/
pipe_double_lock(ipipe, opipe);
+ i_tail = ipipe->tail;
+ i_mask = ipipe->ring_size - 1;
+ o_head = opipe->head;
+ o_mask = opipe->ring_size - 1;
+
do {
+ size_t o_len;
+
if (!opipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
@@ -1548,14 +1580,18 @@ retry:
break;
}
- if (!ipipe->nrbufs && !ipipe->writers)
+ i_head = ipipe->head;
+ o_tail = opipe->tail;
+
+ if (pipe_empty(i_head, i_tail) && !ipipe->writers)
break;
/*
* Cannot make any progress, because either the input
* pipe is empty or the output pipe is full.
*/
- if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
+ if (pipe_empty(i_head, i_tail) ||
+ pipe_full(o_head, o_tail, opipe->max_usage)) {
/* Already processed some buffers, break */
if (ret)
break;
@@ -1575,9 +1611,8 @@ retry:
goto retry;
}
- ibuf = ipipe->bufs + ipipe->curbuf;
- nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
- obuf = opipe->bufs + nbuf;
+ ibuf = &ipipe->bufs[i_tail & i_mask];
+ obuf = &opipe->bufs[o_head & o_mask];
if (len >= ibuf->len) {
/*
@@ -1585,10 +1620,12 @@ retry:
*/
*obuf = *ibuf;
ibuf->ops = NULL;
- opipe->nrbufs++;
- ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
- ipipe->nrbufs--;
+ i_tail++;
+ ipipe->tail = i_tail;
input_wakeup = true;
+ o_len = obuf->len;
+ o_head++;
+ opipe->head = o_head;
} else {
/*
* Get a reference to this pipe buffer,
@@ -1610,12 +1647,14 @@ retry:
pipe_buf_mark_unmergeable(obuf);
obuf->len = len;
- opipe->nrbufs++;
- ibuf->offset += obuf->len;
- ibuf->len -= obuf->len;
+ ibuf->offset += len;
+ ibuf->len -= len;
+ o_len = len;
+ o_head++;
+ opipe->head = o_head;
}
- ret += obuf->len;
- len -= obuf->len;
+ ret += o_len;
+ len -= o_len;
} while (len);
pipe_unlock(ipipe);
@@ -1641,7 +1680,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
- int ret = 0, i = 0, nbuf;
+ unsigned int i_head, o_head;
+ unsigned int i_tail, o_tail;
+ unsigned int i_mask, o_mask;
+ int ret = 0;
/*
* Potential ABBA deadlock, work around it by ordering lock
@@ -1650,6 +1692,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
*/
pipe_double_lock(ipipe, opipe);
+ i_tail = ipipe->tail;
+ i_mask = ipipe->ring_size - 1;
+ o_head = opipe->head;
+ o_mask = opipe->ring_size - 1;
+
do {
if (!opipe->readers) {
send_sig(SIGPIPE, current, 0);
@@ -1658,15 +1705,19 @@ static int link_pipe(struct pipe_inode_info *ipipe,
break;
}
+ i_head = ipipe->head;
+ o_tail = opipe->tail;
+
/*
- * If we have iterated all input buffers or ran out of
+ * If we have iterated all input buffers or run out of
* output room, break.
*/
- if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
+ if (pipe_empty(i_head, i_tail) ||
+ pipe_full(o_head, o_tail, opipe->max_usage))
break;
- ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
- nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
+ ibuf = &ipipe->bufs[i_tail & i_mask];
+ obuf = &opipe->bufs[o_head & o_mask];
/*
* Get a reference to this pipe buffer,
@@ -1678,7 +1729,6 @@ static int link_pipe(struct pipe_inode_info *ipipe,
break;
}
- obuf = opipe->bufs + nbuf;
*obuf = *ibuf;
/*
@@ -1691,11 +1741,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
if (obuf->len > len)
obuf->len = len;
-
- opipe->nrbufs++;
ret += obuf->len;
len -= obuf->len;
- i++;
+
+ o_head++;
+ opipe->head = o_head;
+ i_tail++;
} while (len);
/*
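The splice.c conversion above replaces the curbuf/nrbufs pair with free-running head and tail counters: occupancy is plain unsigned subtraction, emptiness and fullness are comparisons against it, and a slot index is recovered by masking with ring_size - 1, which requires ring_size to be a power of two. A standalone model of that arithmetic follows; the helper names mirror the kernel's but the bodies are a sketch, and the kernel additionally caps occupancy at max_usage, which may be smaller than ring_size (here they coincide).

	#include <assert.h>
	#include <stdio.h>

	/* Free-running counters: head and tail only ever increase and are
	 * reduced modulo ring_size by masking at access time.  Unsigned
	 * wraparound keeps head - tail correct across overflow. */
	static unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
	{
		return head - tail;
	}

	static int pipe_empty(unsigned int head, unsigned int tail)
	{
		return pipe_occupancy(head, tail) == 0;
	}

	static int pipe_full(unsigned int head, unsigned int tail,
			     unsigned int limit)
	{
		return pipe_occupancy(head, tail) >= limit;
	}

	int main(void)
	{
		unsigned int ring_size = 8;       /* must be a power of two */
		unsigned int mask = ring_size - 1;
		unsigned int head = 0, tail = 0;
		int slots[8];

		/* Produce until full, as splice_to_pipe() does. */
		while (!pipe_full(head, tail, ring_size))
			slots[head++ & mask] = 1;
		assert(pipe_occupancy(head, tail) == ring_size);

		/* Consume until empty, as splice_from_pipe_feed() does. */
		while (!pipe_empty(head, tail))
			slots[tail++ & mask] = 0;

		printf("head=%u tail=%u occupancy=%u\n",
		       head, tail, pipe_occupancy(head, tail));
		return 0;
	}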
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 48305ba41e3c..ac7f59a58f94 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -302,11 +302,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
static void timerfd_show(struct seq_file *m, struct file *file)
{
struct timerfd_ctx *ctx = file->private_data;
- struct itimerspec t;
+ struct timespec64 value, interval;
spin_lock_irq(&ctx->wqh.lock);
- t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
- t.it_interval = ktime_to_timespec(ctx->tintv);
+ value = ktime_to_timespec64(timerfd_get_remaining(ctx));
+ interval = ktime_to_timespec64(ctx->tintv);
spin_unlock_irq(&ctx->wqh.lock);
seq_printf(m,
@@ -318,10 +318,10 @@ static void timerfd_show(struct seq_file *m, struct file *file)
ctx->clockid,
(unsigned long long)ctx->ticks,
ctx->settime_flags,
- (unsigned long long)t.it_value.tv_sec,
- (unsigned long long)t.it_value.tv_nsec,
- (unsigned long long)t.it_interval.tv_sec,
- (unsigned long long)t.it_interval.tv_nsec);
+ (unsigned long long)value.tv_sec,
+ (unsigned long long)value.tv_nsec,
+ (unsigned long long)interval.tv_sec,
+ (unsigned long long)interval.tv_nsec);
}
#else
#define timerfd_show NULL
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f9fd18670e22..37df7c9eedb1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1460,7 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
start = vma->vm_start;
vma_end = min(end, vma->vm_end);
- new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+ new_flags = (vma->vm_flags &
+ ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
@@ -1834,13 +1835,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
features = uffdio_api.features;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
- memset(&uffdio_api, 0, sizeof(uffdio_api));
- if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
- goto out;
- ret = -EINVAL;
- goto out;
- }
+ ret = -EINVAL;
+ if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ goto err_out;
+ ret = -EPERM;
+ if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
+ goto err_out;
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
uffdio_api.ioctls = UFFD_API_IOCTLS;
@@ -1853,6 +1853,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = 0;
out:
return ret;
+err_out:
+ memset(&uffdio_api, 0, sizeof(uffdio_api));
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ ret = -EFAULT;
+ goto out;
}
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
@@ -1923,7 +1928,7 @@ static const struct file_operations userfaultfd_fops = {
.poll = userfaultfd_poll,
.read = userfaultfd_read,
.unlocked_ioctl = userfaultfd_ioctl,
- .compat_ioctl = userfaultfd_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.llseek = noop_llseek,
};
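The reworked userfaultfd_api() above funnels every validation failure through one err_out label that zeroes the struct before copying it back, and adds a CAP_SYS_PTRACE requirement for UFFD_FEATURE_EVENT_FORK. A sketch of that single-exit pattern, with the capability check reduced to a boolean stand-in and illustrative constants in place of the uapi values:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	#define UFFD_API           0xAA
	#define API_FEATURES       0x0F
	#define FEATURE_EVENT_FORK 0x01

	struct api_req { unsigned long api, features; };

	/* Stub for capable(CAP_SYS_PTRACE); the real check is in-kernel. */
	static bool can_ptrace;

	/* Mirrors the reworked userfaultfd_api(): every failure takes the
	 * same err_out path, which reports a zeroed struct to the caller. */
	static int handshake(struct api_req *req)
	{
		int ret;

		ret = -EINVAL;
		if (req->api != UFFD_API || (req->features & ~API_FEATURES))
			goto err_out;
		ret = -EPERM;
		if ((req->features & FEATURE_EVENT_FORK) && !can_ptrace)
			goto err_out;

		req->features = API_FEATURES;  /* report what we support */
		return 0;

	err_out:
		memset(req, 0, sizeof(*req));  /* never leak partial state */
		return ret;
	}

	int main(void)
	{
		struct api_req bad  = { .api = 0x00 };
		struct api_req fork = { .api = UFFD_API,
					.features = FEATURE_EVENT_FORK };

		printf("bad api: %d\n", handshake(&bad));
		printf("no caps: %d\n", handshake(&fork));
		return 0;
	}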
diff --git a/fs/utimes.c b/fs/utimes.c
index 1ba3f7883870..c952b6b3d8a0 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -161,9 +161,9 @@ SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
* utimensat() instead.
*/
static long do_futimesat(int dfd, const char __user *filename,
- struct timeval __user *utimes)
+ struct __kernel_old_timeval __user *utimes)
{
- struct timeval times[2];
+ struct __kernel_old_timeval times[2];
struct timespec64 tstimes[2];
if (utimes) {
@@ -190,13 +190,13 @@ static long do_futimesat(int dfd, const char __user *filename,
SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
- struct timeval __user *, utimes)
+ struct __kernel_old_timeval __user *, utimes)
{
return do_futimesat(dfd, filename, utimes);
}
SYSCALL_DEFINE2(utimes, char __user *, filename,
- struct timeval __user *, utimes)
+ struct __kernel_old_timeval __user *, utimes)
{
return do_futimesat(AT_FDCWD, filename, utimes);
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 02469d59c787..ef75e223cb70 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -34,6 +34,7 @@
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
+#include "xfs_iomap.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -4456,16 +4457,21 @@ int
xfs_bmapi_convert_delalloc(
struct xfs_inode *ip,
int whichfork,
- xfs_fileoff_t offset_fsb,
- struct xfs_bmbt_irec *imap,
+ xfs_off_t offset,
+ struct iomap *iomap,
unsigned int *seq)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmalloca bma = { NULL };
+ u16 flags = 0;
struct xfs_trans *tp;
int error;
+ if (whichfork == XFS_COW_FORK)
+ flags |= IOMAP_F_SHARED;
+
/*
* Space for the extent and indirect blocks was reserved when the
* delalloc extent was created so there's no need to do so here.
@@ -4495,7 +4501,7 @@ xfs_bmapi_convert_delalloc(
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
- *imap = bma.got;
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4528,7 +4534,7 @@ xfs_bmapi_convert_delalloc(
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
- *imap = bma.got;
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e2798c6f3a5f..14d25e0b7d9c 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -228,8 +228,7 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
- xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
- unsigned int *seq);
+ xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f16d5f196c6b..5936507c6f50 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -18,17 +18,18 @@
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
-/*
- * structure owned by writepages passed to individual writepage calls
- */
struct xfs_writepage_ctx {
- struct xfs_bmbt_irec imap;
- int fork;
+ struct iomap_writepage_ctx ctx;
unsigned int data_seq;
unsigned int cow_seq;
- struct xfs_ioend *ioend;
};
+static inline struct xfs_writepage_ctx *
+XFS_WPC(struct iomap_writepage_ctx *ctx)
+{
+ return container_of(ctx, struct xfs_writepage_ctx, ctx);
+}
+
struct block_device *
xfs_find_bdev_for_inode(
struct inode *inode)
@@ -55,71 +56,10 @@ xfs_find_daxdev_for_inode(
return mp->m_ddev_targp->bt_daxdev;
}
-static void
-xfs_finish_page_writeback(
- struct inode *inode,
- struct bio_vec *bvec,
- int error)
-{
- struct iomap_page *iop = to_iomap_page(bvec->bv_page);
-
- if (error) {
- SetPageError(bvec->bv_page);
- mapping_set_error(inode->i_mapping, -EIO);
- }
-
- ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
- ASSERT(!iop || atomic_read(&iop->write_count) > 0);
-
- if (!iop || atomic_dec_and_test(&iop->write_count))
- end_page_writeback(bvec->bv_page);
-}
-
-/*
- * We're now finished for good with this ioend structure. Update the page
- * state, release holds on bios, and finally free up memory. Do not use the
- * ioend after this.
- */
-STATIC void
-xfs_destroy_ioend(
- struct xfs_ioend *ioend,
- int error)
-{
- struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_inline_bio;
- struct bio *last = ioend->io_bio, *next;
- u64 start = bio->bi_iter.bi_sector;
- bool quiet = bio_flagged(bio, BIO_QUIET);
-
- for (bio = &ioend->io_inline_bio; bio; bio = next) {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * For the last bio, bi_private points to the ioend, so we
- * need to explicitly end the iteration here.
- */
- if (bio == last)
- next = NULL;
- else
- next = bio->bi_private;
-
- /* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bvec, bio, iter_all)
- xfs_finish_page_writeback(inode, bvec, error);
- bio_put(bio);
- }
-
- if (unlikely(error && !quiet)) {
- xfs_err_ratelimited(XFS_I(inode)->i_mount,
- "writeback error on sector %llu", start);
- }
-}
-
/*
* Fast and loose check if this write could update the on-disk inode size.
*/
-static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
return ioend->io_offset + ioend->io_size >
XFS_I(ioend->io_inode)->i_d.di_size;
@@ -127,7 +67,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
STATIC int
xfs_setfilesize_trans_alloc(
- struct xfs_ioend *ioend)
+ struct iomap_ioend *ioend)
{
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
struct xfs_trans *tp;
@@ -137,7 +77,7 @@ xfs_setfilesize_trans_alloc(
if (error)
return error;
- ioend->io_append_trans = tp;
+ ioend->io_private = tp;
/*
* We may pass freeze protection with a transaction. So tell lockdep
@@ -200,11 +140,11 @@ xfs_setfilesize(
STATIC int
xfs_setfilesize_ioend(
- struct xfs_ioend *ioend,
+ struct iomap_ioend *ioend,
int error)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_trans *tp = ioend->io_append_trans;
+ struct xfs_trans *tp = ioend->io_private;
/*
* The transaction may have been allocated in the I/O submission thread,
@@ -228,9 +168,8 @@ xfs_setfilesize_ioend(
*/
STATIC void
xfs_end_ioend(
- struct xfs_ioend *ioend)
+ struct iomap_ioend *ioend)
{
- struct list_head ioend_list;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
@@ -257,7 +196,7 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- if (ioend->io_fork == XFS_COW_FORK)
+ if (ioend->io_flags & IOMAP_F_SHARED)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
goto done;
}
@@ -265,154 +204,86 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_fork == XFS_COW_FORK)
+ if (ioend->io_flags & IOMAP_F_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_state == XFS_EXT_UNWRITTEN)
+ else if (ioend->io_type == IOMAP_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
else
- ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
+ ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);
done:
- if (ioend->io_append_trans)
+ if (ioend->io_private)
error = xfs_setfilesize_ioend(ioend, error);
- list_replace_init(&ioend->io_list, &ioend_list);
- xfs_destroy_ioend(ioend, error);
-
- while (!list_empty(&ioend_list)) {
- ioend = list_first_entry(&ioend_list, struct xfs_ioend,
- io_list);
- list_del_init(&ioend->io_list);
- xfs_destroy_ioend(ioend, error);
- }
-
+ iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-xfs_ioend_can_merge(
- struct xfs_ioend *ioend,
- struct xfs_ioend *next)
-{
- if (ioend->io_bio->bi_status != next->io_bio->bi_status)
- return false;
- if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
- return false;
- if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
- (next->io_state == XFS_EXT_UNWRITTEN))
- return false;
- if (ioend->io_offset + ioend->io_size != next->io_offset)
- return false;
- return true;
-}
-
-/*
* If the to be merged ioend has a preallocated transaction for file
* size updates we need to ensure the ioend it is merged into also
* has one. If it already has one we can simply cancel the transaction
* as it is guaranteed to be clean.
*/
static void
-xfs_ioend_merge_append_transactions(
- struct xfs_ioend *ioend,
- struct xfs_ioend *next)
+xfs_ioend_merge_private(
+ struct iomap_ioend *ioend,
+ struct iomap_ioend *next)
{
- if (!ioend->io_append_trans) {
- ioend->io_append_trans = next->io_append_trans;
- next->io_append_trans = NULL;
+ if (!ioend->io_private) {
+ ioend->io_private = next->io_private;
+ next->io_private = NULL;
} else {
xfs_setfilesize_ioend(next, -ECANCELED);
}
}
-/* Try to merge adjacent completions. */
-STATIC void
-xfs_ioend_try_merge(
- struct xfs_ioend *ioend,
- struct list_head *more_ioends)
-{
- struct xfs_ioend *next_ioend;
-
- while (!list_empty(more_ioends)) {
- next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
- io_list);
- if (!xfs_ioend_can_merge(ioend, next_ioend))
- break;
- list_move_tail(&next_ioend->io_list, &ioend->io_list);
- ioend->io_size += next_ioend->io_size;
- if (next_ioend->io_append_trans)
- xfs_ioend_merge_append_transactions(ioend, next_ioend);
- }
-}
-
-/* list_sort compare function for ioends */
-static int
-xfs_ioend_compare(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_ioend *ia;
- struct xfs_ioend *ib;
-
- ia = container_of(a, struct xfs_ioend, io_list);
- ib = container_of(b, struct xfs_ioend, io_list);
- if (ia->io_offset < ib->io_offset)
- return -1;
- else if (ia->io_offset > ib->io_offset)
- return 1;
- return 0;
-}
-
/* Finish all pending io completions. */
void
xfs_end_io(
struct work_struct *work)
{
- struct xfs_inode *ip;
- struct xfs_ioend *ioend;
- struct list_head completion_list;
+ struct xfs_inode *ip =
+ container_of(work, struct xfs_inode, i_ioend_work);
+ struct iomap_ioend *ioend;
+ struct list_head tmp;
unsigned long flags;
- ip = container_of(work, struct xfs_inode, i_ioend_work);
-
spin_lock_irqsave(&ip->i_ioend_lock, flags);
- list_replace_init(&ip->i_ioend_list, &completion_list);
+ list_replace_init(&ip->i_ioend_list, &tmp);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
- list_sort(NULL, &completion_list, xfs_ioend_compare);
-
- while (!list_empty(&completion_list)) {
- ioend = list_first_entry(&completion_list, struct xfs_ioend,
- io_list);
+ iomap_sort_ioends(&tmp);
+ while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
+ io_list))) {
list_del_init(&ioend->io_list);
- xfs_ioend_try_merge(ioend, &completion_list);
+ iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private);
xfs_end_ioend(ioend);
}
}
+static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend)
+{
+ return ioend->io_private ||
+ ioend->io_type == IOMAP_UNWRITTEN ||
+ (ioend->io_flags & IOMAP_F_SHARED);
+}
+
STATIC void
xfs_end_bio(
struct bio *bio)
{
- struct xfs_ioend *ioend = bio->bi_private;
+ struct iomap_ioend *ioend = bio->bi_private;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
- if (ioend->io_fork == XFS_COW_FORK ||
- ioend->io_state == XFS_EXT_UNWRITTEN ||
- ioend->io_append_trans != NULL) {
- spin_lock_irqsave(&ip->i_ioend_lock, flags);
- if (list_empty(&ip->i_ioend_list))
- WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
- &ip->i_ioend_work));
- list_add_tail(&ioend->io_list, &ip->i_ioend_list);
- spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
- } else
- xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
+ ASSERT(xfs_ioend_needs_workqueue(ioend));
+
+ spin_lock_irqsave(&ip->i_ioend_lock, flags);
+ if (list_empty(&ip->i_ioend_list))
+ WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+ &ip->i_ioend_work));
+ list_add_tail(&ioend->io_list, &ip->i_ioend_list);
+ spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}
/*
@@ -421,19 +292,19 @@ xfs_end_bio(
*/
static bool
xfs_imap_valid(
- struct xfs_writepage_ctx *wpc,
+ struct iomap_writepage_ctx *wpc,
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb)
+ loff_t offset)
{
- if (offset_fsb < wpc->imap.br_startoff ||
- offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length)
return false;
/*
* If this is a COW mapping, it is sufficient to check that the mapping
* covers the offset. Be careful to check this first because the caller
* can revalidate a COW mapping without updating the data seqno.
*/
- if (wpc->fork == XFS_COW_FORK)
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
return true;
/*
@@ -443,17 +314,17 @@ xfs_imap_valid(
* checked (and found nothing at this offset) could have added
* overlapping blocks.
*/
- if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
return false;
if (xfs_inode_has_cow_data(ip) &&
- wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
return false;
return true;
}
/*
 * Pass in a delalloc extent and convert it to real extents; return the real
- * extent that maps offset_fsb in wpc->imap.
+ * extent that maps offset_fsb in wpc->iomap.
*
* The current page is held locked so nothing could have removed the block
* backing offset_fsb, although it could have moved from the COW to the data
@@ -461,32 +332,38 @@ xfs_imap_valid(
*/
static int
xfs_convert_blocks(
- struct xfs_writepage_ctx *wpc,
+ struct iomap_writepage_ctx *wpc,
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb)
+ int whichfork,
+ loff_t offset)
{
int error;
+ unsigned *seq;
+
+ if (whichfork == XFS_COW_FORK)
+ seq = &XFS_WPC(wpc)->cow_seq;
+ else
+ seq = &XFS_WPC(wpc)->data_seq;
/*
- * Attempt to allocate whatever delalloc extent currently backs
- * offset_fsb and put the result into wpc->imap. Allocate in a loop
- * because it may take several attempts to allocate real blocks for a
- * contiguous delalloc extent if free space is sufficiently fragmented.
+ * Attempt to allocate whatever delalloc extent currently backs offset
+ * and put the result into wpc->iomap. Allocate in a loop because it
+ * may take several attempts to allocate real blocks for a contiguous
+ * delalloc extent if free space is sufficiently fragmented.
*/
do {
- error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
- &wpc->imap, wpc->fork == XFS_COW_FORK ?
- &wpc->cow_seq : &wpc->data_seq);
+ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ &wpc->iomap, seq);
if (error)
return error;
- } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+ } while (wpc->iomap.offset + wpc->iomap.length <= offset);
return 0;
}
-STATIC int
+static int
xfs_map_blocks(
- struct xfs_writepage_ctx *wpc,
+ struct iomap_writepage_ctx *wpc,
struct inode *inode,
loff_t offset)
{
@@ -496,6 +373,7 @@ xfs_map_blocks(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
+ int whichfork = XFS_DATA_FORK;
struct xfs_bmbt_irec imap;
struct xfs_iext_cursor icur;
int retries = 0;
@@ -519,7 +397,7 @@ xfs_map_blocks(
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- if (xfs_imap_valid(wpc, ip, offset_fsb))
+ if (xfs_imap_valid(wpc, ip, offset))
return 0;
/*
@@ -541,10 +419,10 @@ retry:
xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
cow_fsb = imap.br_startoff;
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
- wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
+ XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- wpc->fork = XFS_COW_FORK;
+ whichfork = XFS_COW_FORK;
goto allocate_blocks;
}
@@ -552,7 +430,7 @@ retry:
* No COW extent overlap. Revalidate now that we may have updated
* ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (xfs_imap_valid(wpc, ip, offset_fsb)) {
+ if (xfs_imap_valid(wpc, ip, offset)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
@@ -564,11 +442,9 @@ retry:
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
- wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
+ XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- wpc->fork = XFS_DATA_FORK;
-
/* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
imap.br_blockcount = imap.br_startoff - offset_fsb;
@@ -592,11 +468,11 @@ retry:
isnullstartblock(imap.br_startblock))
goto allocate_blocks;
- wpc->imap = imap;
- trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
+ trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
- error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ error = xfs_convert_blocks(wpc, ip, whichfork, offset);
if (error) {
/*
* If we failed to find the extent in the COW fork we might have
@@ -605,7 +481,7 @@ allocate_blocks:
* the former case, but prevent additional retries to avoid
* looping forever for the latter case.
*/
- if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
goto retry;
ASSERT(error != -EAGAIN);
return error;
@@ -616,34 +492,22 @@ allocate_blocks:
* original delalloc one. Trim the return extent to the next COW
* boundary again to force a re-lookup.
*/
- if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
- cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
- wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+ if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
+ loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
+
+ if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
+ wpc->iomap.length = cow_offset - wpc->iomap.offset;
+ }
- ASSERT(wpc->imap.br_startoff <= offset_fsb);
- ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
+ ASSERT(wpc->iomap.offset <= offset);
+ ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
+ trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
return 0;
}
-/*
- * Submit the bio for an ioend. We are passed an ioend with a bio attached to
- * it, and we submit that bio. The ioend may be used for multiple bio
- * submissions, so we only want to allocate an append transaction for the ioend
- * once. In the case of multiple bio submission, each bio will take an IO
- * reference to the ioend to ensure that the ioend completion is only done once
- * all bios have been submitted and the ioend is really done.
- *
- * If @status is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we have marked paged for writeback
- * and unlocked them. In this situation, we need to fail the bio and ioend
- * rather than submit it to IO. This typically only happens on a filesystem
- * shutdown.
- */
-STATIC int
-xfs_submit_ioend(
- struct writeback_control *wbc,
- struct xfs_ioend *ioend,
+static int
+xfs_prepare_ioend(
+ struct iomap_ioend *ioend,
int status)
{
unsigned int nofs_flag;
@@ -656,157 +520,24 @@ xfs_submit_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
- if (!status && ioend->io_fork == XFS_COW_FORK) {
+ if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- (ioend->io_fork == XFS_COW_FORK ||
- ioend->io_state != XFS_EXT_UNWRITTEN) &&
+ ((ioend->io_flags & IOMAP_F_SHARED) ||
+ ioend->io_type != IOMAP_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
- !ioend->io_append_trans)
+ !ioend->io_private)
status = xfs_setfilesize_trans_alloc(ioend);
memalloc_nofs_restore(nofs_flag);
- ioend->io_bio->bi_private = ioend;
- ioend->io_bio->bi_end_io = xfs_end_bio;
-
- /*
- * If we are failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- if (status) {
- ioend->io_bio->bi_status = errno_to_blk_status(status);
- bio_endio(ioend->io_bio);
- return status;
- }
-
- submit_bio(ioend->io_bio);
- return 0;
-}
-
-static struct xfs_ioend *
-xfs_alloc_ioend(
- struct inode *inode,
- int fork,
- xfs_exntst_t state,
- xfs_off_t offset,
- struct block_device *bdev,
- sector_t sector,
- struct writeback_control *wbc)
-{
- struct xfs_ioend *ioend;
- struct bio *bio;
-
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = sector;
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- bio->bi_write_hint = inode->i_write_hint;
- wbc_init_bio(wbc, bio);
-
- ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_fork = fork;
- ioend->io_state = state;
- ioend->io_inode = inode;
- ioend->io_size = 0;
- ioend->io_offset = offset;
- ioend->io_append_trans = NULL;
- ioend->io_bio = bio;
- return ioend;
-}
-
-/*
- * Allocate a new bio, and chain the old bio to the new one.
- *
- * Note that we have to do perform the chaining in this unintuitive order
- * so that the bi_private linkage is set up in the right direction for the
- * traversal in xfs_destroy_ioend().
- */
-static struct bio *
-xfs_chain_bio(
- struct bio *prev)
-{
- struct bio *new;
-
- new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
- bio_copy_dev(new, prev);/* also copies over blkcg information */
- new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
- new->bi_write_hint = prev->bi_write_hint;
-
- bio_chain(prev, new);
- bio_get(prev); /* for xfs_destroy_ioend */
- submit_bio(prev);
- return new;
-}
-
-/*
- * Test to see if we have an existing ioend structure that we could append to
- * first, otherwise finish off the current ioend and start another.
- */
-STATIC void
-xfs_add_to_ioend(
- struct inode *inode,
- xfs_off_t offset,
- struct page *page,
- struct iomap_page *iop,
- struct xfs_writepage_ctx *wpc,
- struct writeback_control *wbc,
- struct list_head *iolist)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct block_device *bdev = xfs_find_bdev_for_inode(inode);
- unsigned len = i_blocksize(inode);
- unsigned poff = offset & (PAGE_SIZE - 1);
- bool merged, same_page = false;
- sector_t sector;
-
- sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
- ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
-
- if (!wpc->ioend ||
- wpc->fork != wpc->ioend->io_fork ||
- wpc->imap.br_state != wpc->ioend->io_state ||
- sector != bio_end_sector(wpc->ioend->io_bio) ||
- offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
- if (wpc->ioend)
- list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
- wpc->imap.br_state, offset, bdev, sector, wbc);
- }
-
- merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
- &same_page);
-
- if (iop && !same_page)
- atomic_inc(&iop->write_count);
-
- if (!merged) {
- if (bio_full(wpc->ioend->io_bio, len))
- wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);
- bio_add_page(wpc->ioend->io_bio, page, len, poff);
- }
-
- wpc->ioend->io_size += len;
- wbc_account_cgroup_owner(wbc, page, len);
-}
-
-STATIC void
-xfs_vm_invalidatepage(
- struct page *page,
- unsigned int offset,
- unsigned int length)
-{
- trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
- iomap_invalidatepage(page, offset, length);
+ if (xfs_ioend_needs_workqueue(ioend))
+ ioend->io_bio->bi_end_io = xfs_end_bio;
+ return status;
}
/*
@@ -820,8 +551,8 @@ xfs_vm_invalidatepage(
* transaction as there is no space left for block reservation (typically why we
* see a ENOSPC in writeback).
*/
-STATIC void
-xfs_aops_discard_page(
+static void
+xfs_discard_page(
struct page *page)
{
struct inode *inode = page->mapping->host;
@@ -843,246 +574,14 @@ xfs_aops_discard_page(
if (error && !XFS_FORCED_SHUTDOWN(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
- xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
+ iomap_invalidatepage(page, 0, PAGE_SIZE);
}
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * forward progress guarantees we need to provide. The current ioend we are
- * adding blocks to is cached on the writepage context, and if the new block
- * does not append to the cached ioend it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected. While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int
-xfs_writepage_map(
- struct xfs_writepage_ctx *wpc,
- struct writeback_control *wbc,
- struct inode *inode,
- struct page *page,
- uint64_t end_offset)
-{
- LIST_HEAD(submit_list);
- struct iomap_page *iop = to_iomap_page(page);
- unsigned len = i_blocksize(inode);
- struct xfs_ioend *ioend, *next;
- uint64_t file_offset; /* file offset of page */
- int error = 0, count = 0, i;
-
- ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
- ASSERT(!iop || atomic_read(&iop->write_count) == 0);
-
- /*
- * Walk through the page to find areas to write back. If we run off the
- * end of the current map or find the current map invalid, grab a new
- * one.
- */
- for (i = 0, file_offset = page_offset(page);
- i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
- i++, file_offset += len) {
- if (iop && !test_bit(i, iop->uptodate))
- continue;
-
- error = xfs_map_blocks(wpc, inode, file_offset);
- if (error)
- break;
- if (wpc->imap.br_startblock == HOLESTARTBLOCK)
- continue;
- xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
- &submit_list);
- count++;
- }
-
- ASSERT(wpc->ioend || list_empty(&submit_list));
- ASSERT(PageLocked(page));
- ASSERT(!PageWriteback(page));
-
- /*
- * On error, we have to fail the ioend here because we may have set
- * pages under writeback, we have to make sure we run IO completion to
- * mark the error state of the IO appropriately, so we can't cancel the
- * ioend directly here. That means we have to mark this page as under
- * writeback if we included any blocks from it in the ioend chain so
- * that completion treats it correctly.
- *
- * If we didn't include the page in the ioend, the on error we can
- * simply discard and unlock it as there are no other users of the page
- * now. The caller will still need to trigger submission of outstanding
- * ioends on the writepage context so they are treated correctly on
- * error.
- */
- if (unlikely(error)) {
- if (!count) {
- xfs_aops_discard_page(page);
- ClearPageUptodate(page);
- unlock_page(page);
- goto done;
- }
-
- /*
- * If the page was not fully cleaned, we need to ensure that the
- * higher layers come back to it correctly. That means we need
- * to keep the page dirty, and for WB_SYNC_ALL writeback we need
- * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
- * so another attempt to write this page in this writeback sweep
- * will be made.
- */
- set_page_writeback_keepwrite(page);
- } else {
- clear_page_dirty_for_io(page);
- set_page_writeback(page);
- }
-
- unlock_page(page);
-
- /*
- * Preserve the original error if there was one, otherwise catch
- * submission errors here and propagate into subsequent ioend
- * submissions.
- */
- list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
- int error2;
-
- list_del_init(&ioend->io_list);
- error2 = xfs_submit_ioend(wbc, ioend, error);
- if (error2 && !error)
- error = error2;
- }
-
- /*
- * We can end up here with no error and nothing to write only if we race
- * with a partial page truncate on a sub-page block sized filesystem.
- */
- if (!count)
- end_page_writeback(page);
-done:
- mapping_set_error(page->mapping, error);
- return error;
-}
-
-/*
- * Write out a dirty page.
- *
- * For delalloc space on the page we need to allocate space and flush it.
- * For unwritten space on the page we need to start the conversion to
- * regular allocated space.
- */
-STATIC int
-xfs_do_writepage(
- struct page *page,
- struct writeback_control *wbc,
- void *data)
-{
- struct xfs_writepage_ctx *wpc = data;
- struct inode *inode = page->mapping->host;
- loff_t offset;
- uint64_t end_offset;
- pgoff_t end_index;
-
- trace_xfs_writepage(inode, page, 0, 0);
-
- /*
- * Refuse to write the page out if we are called from reclaim context.
- *
- * This avoids stack overflows when called from deeply used stacks in
- * random callers for direct reclaim or memcg reclaim. We explicitly
- * allow reclaim from kswapd as the stack usage there is relatively low.
- *
- * This should never happen except in the case of a VM regression so
- * warn about it.
- */
- if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
- PF_MEMALLOC))
- goto redirty;
-
- /*
- * Given that we do not allow direct reclaim to call us, we should
- * never be called while in a filesystem transaction.
- */
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
- goto redirty;
-
- /*
- * Is this page beyond the end of the file?
- *
- * The page index is less than the end_index, adjust the end_offset
- * to the highest offset that this page should represent.
- * -----------------------------------------------------
- * | file mapping | <EOF> |
- * -----------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | |
- * ^--------------------------------^----------|--------
- * | desired writeback range | see else |
- * ---------------------------------^------------------|
- */
- offset = i_size_read(inode);
- end_index = offset >> PAGE_SHIFT;
- if (page->index < end_index)
- end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
- else {
- /*
- * Check whether the page to write out is beyond or straddles
- * i_size or not.
- * -------------------------------------------------------
- * | file mapping | <EOF> |
- * -------------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
- * ^--------------------------------^-----------|---------
- * | | Straddles |
- * ---------------------------------^-----------|--------|
- */
- unsigned offset_into_page = offset & (PAGE_SIZE - 1);
-
- /*
- * Skip the page if it is fully outside i_size, e.g. due to a
- * truncate operation that is in progress. We must redirty the
- * page so that reclaim stops reclaiming it. Otherwise
- * xfs_vm_releasepage() is called on it and gets confused.
- *
- * Note that the end_index is unsigned long, it would overflow
- * if the given offset is greater than 16TB on 32-bit system
- * and if we do check the page is fully outside i_size or not
- * via "if (page->index >= end_index + 1)" as "end_index + 1"
- * will be evaluated to 0. Hence this page will be redirtied
- * and be written out repeatedly which would result in an
- * infinite loop, the user program that perform this operation
- * will hang. Instead, we can verify this situation by checking
- * if the page to write is totally beyond the i_size or if it's
- * offset is just equal to the EOF.
- */
- if (page->index > end_index ||
- (page->index == end_index && offset_into_page == 0))
- goto redirty;
-
- /*
- * The page straddles i_size. It must be zeroed out on each
- * and every writepage invocation because it may be mmapped.
- * "A file is mapped in multiples of the page size. For a file
- * that is not a multiple of the page size, the remaining
- * memory is zeroed when mapped, and writes to that region are
- * not written out to the file."
- */
- zero_user_segment(page, offset_into_page, PAGE_SIZE);
-
- /* Adjust the end_offset to the end of file */
- end_offset = offset;
- }
-
- return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
-
-redirty:
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
-}
+static const struct iomap_writeback_ops xfs_writeback_ops = {
+ .map_blocks = xfs_map_blocks,
+ .prepare_ioend = xfs_prepare_ioend,
+ .discard_page = xfs_discard_page,
+};
STATIC int
xfs_vm_writepage(
@@ -1090,12 +589,8 @@ xfs_vm_writepage(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
- int ret;
- ret = xfs_do_writepage(page, wbc, &wpc);
- if (wpc.ioend)
- ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
- return ret;
+ return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
}
STATIC int
@@ -1104,13 +599,9 @@ xfs_vm_writepages(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
- int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
- if (wpc.ioend)
- ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
- return ret;
+ return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
STATIC int
@@ -1123,15 +614,6 @@ xfs_dax_writepages(
xfs_find_bdev_for_inode(mapping->host), wbc);
}
-STATIC int
-xfs_vm_releasepage(
- struct page *page,
- gfp_t gfp_mask)
-{
- trace_xfs_releasepage(page->mapping->host, page, 0, 0);
- return iomap_releasepage(page, gfp_mask);
-}
-
STATIC sector_t
xfs_vm_bmap(
struct address_space *mapping,
@@ -1160,7 +642,6 @@ xfs_vm_readpage(
struct file *unused,
struct page *page)
{
- trace_xfs_vm_readpage(page->mapping->host, 1);
return iomap_readpage(page, &xfs_iomap_ops);
}
@@ -1171,7 +652,6 @@ xfs_vm_readpages(
struct list_head *pages,
unsigned nr_pages)
{
- trace_xfs_vm_readpages(mapping->host, nr_pages);
return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
}
@@ -1191,8 +671,8 @@ const struct address_space_operations xfs_address_space_operations = {
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
.set_page_dirty = iomap_set_page_dirty,
- .releasepage = xfs_vm_releasepage,
- .invalidatepage = xfs_vm_invalidatepage,
+ .releasepage = iomap_releasepage,
+ .invalidatepage = iomap_invalidatepage,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
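XFS_WPC() above is the standard embed-and-recover pattern: the generic iomap_writepage_ctx sits at the head of xfs_writepage_ctx, the iomap code passes the inner pointer around, and the filesystem gets its private sequence counters back with container_of(). A self-contained sketch of the same pattern with renamed, illustrative types:

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	/* Generic layer state, analogous to struct iomap_writepage_ctx. */
	struct generic_ctx {
		long offset;
	};

	/* Filesystem wrapper, analogous to struct xfs_writepage_ctx: the
	 * generic part is embedded so the common code can pass it around,
	 * and the fs recovers its private fields in callbacks. */
	struct fs_ctx {
		struct generic_ctx ctx;
		unsigned int data_seq;
		unsigned int cow_seq;
	};

	static struct fs_ctx *FS_WPC(struct generic_ctx *ctx)
	{
		return container_of(ctx, struct fs_ctx, ctx);
	}

	/* A callback that only receives the generic context. */
	static void map_blocks(struct generic_ctx *ctx)
	{
		FS_WPC(ctx)->data_seq++;
	}

	int main(void)
	{
		struct fs_ctx wpc = { .ctx.offset = 4096 };

		map_blocks(&wpc.ctx);
		printf("offset=%ld data_seq=%u\n", wpc.ctx.offset, wpc.data_seq);
		return 0;
	}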
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 45a1ea240cbb..687b11f34fa2 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -6,23 +6,6 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern struct bio_set xfs_ioend_bioset;
-
-/*
- * Structure for buffered I/O completions.
- */
-struct xfs_ioend {
- struct list_head io_list; /* next ioend in chain */
- int io_fork; /* inode fork written back */
- xfs_exntst_t io_state; /* extent state */
- struct inode *io_inode; /* file being written to */
- size_t io_size; /* size of the extent */
- xfs_off_t io_offset; /* offset in the file */
- struct xfs_trans *io_append_trans;/* xact. for size update */
- struct bio *io_bio; /* bio being built */
- struct bio io_inline_bio; /* MUST BE LAST! */
-};
-
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1ffb179f35d2..c0620135a279 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -188,7 +188,7 @@ xfs_file_dio_aio_read(
file_accessed(iocb->ki_filp);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
+ ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL, is_sync_kiocb(iocb));
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -547,15 +547,12 @@ xfs_file_dio_aio_write(
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
- ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
-
/*
- * If unaligned, this is the only IO in-flight. If it has not yet
- * completed, wait on it before we release the iolock to prevent
- * subsequent overlapping IO.
+ * If unaligned, this is the only IO in-flight. Wait on it before we
+ * release the iolock to prevent subsequent overlapping IO.
*/
- if (ret == -EIOCBQUEUED && unaligned_io)
- inode_dio_wait(inode);
+ ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops,
+ is_sync_kiocb(iocb) || unaligned_io);
out:
xfs_iunlock(ip, iolock);
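
Both direct I/O call sites illustrate the new contract: iomap_dio_rw() now takes a wait_for_completion flag, so the caller states up front that the I/O must complete synchronously instead of fishing for -EIOCBQUEUED and calling inode_dio_wait() afterwards. A sketch of the write-side decision (myfs_iomap_ops and the helper are placeholders):

static ssize_t
myfs_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	bool			unaligned_io)
{
	/*
	 * Sync kiocbs must complete before returning, and unaligned I/O
	 * must not overlap a later submission, so both force the wait
	 * inside iomap_dio_rw() itself.
	 */
	return iomap_dio_rw(iocb, from, &myfs_iomap_ops, NULL,
			is_sync_kiocb(iocb) || unaligned_io);
}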
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f780e223b118..95719e161286 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -54,7 +54,7 @@ xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
- bool shared)
+ u16 flags)
{
struct xfs_mount *mp = ip->i_mount;
@@ -79,12 +79,11 @@ xfs_bmbt_to_iomap(
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+ iomap->flags = flags;
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
- if (shared)
- iomap->flags |= IOMAP_F_SHARED;
return 0;
}
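
xfs_bmbt_to_iomap() now takes a u16 of precomputed IOMAP_F_* flags instead of a single bool, letting callers combine IOMAP_F_SHARED, IOMAP_F_NEW and IOMAP_F_DIRTY freely while the function still ORs in IOMAP_F_DIRTY for pinned inodes with fsync-relevant changes. The calling convention in the hunks below reduces to roughly:

	u16	iomap_flags = 0;

	if (shared)
		iomap_flags |= IOMAP_F_SHARED;
	if (allocated_new_blocks)	/* hypothetical predicate */
		iomap_flags |= IOMAP_F_NEW;
	error = xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);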
@@ -540,6 +539,7 @@ xfs_file_iomap_begin_delay(
struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
bool eof = false, cow_eof = false, shared = false;
+ u16 iomap_flags = 0;
int whichfork = XFS_DATA_FORK;
int error = 0;
@@ -707,22 +707,28 @@ retry:
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
*/
- iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, count, whichfork,
- whichfork == XFS_DATA_FORK ? &imap : &cmap);
+ if (whichfork == XFS_DATA_FORK) {
+ iomap_flags |= IOMAP_F_NEW;
+ trace_xfs_iomap_alloc(ip, offset, count, whichfork, &imap);
+ } else {
+ trace_xfs_iomap_alloc(ip, offset, count, whichfork, &cmap);
+ }
done:
if (whichfork == XFS_COW_FORK) {
if (imap.br_startoff > offset_fsb) {
xfs_trim_extent(&cmap, offset_fsb,
imap.br_startoff - offset_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap,
+ IOMAP_F_SHARED);
goto out_unlock;
}
/* ensure we only report blocks we have a reservation for */
xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
shared = true;
}
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+ if (shared)
+ iomap_flags |= IOMAP_F_SHARED;
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -922,7 +928,8 @@ xfs_file_iomap_begin(
loff_t offset,
loff_t length,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -930,6 +937,7 @@ xfs_file_iomap_begin(
xfs_fileoff_t offset_fsb, end_fsb;
int nimaps = 1, error = 0;
bool shared = false;
+ u16 iomap_flags = 0;
unsigned lockmode;
if (XFS_FORCED_SHUTDOWN(mp))
@@ -1045,11 +1053,20 @@ xfs_file_iomap_begin(
if (error)
return error;
- iomap->flags |= IOMAP_F_NEW;
+ iomap_flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
- return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+ /*
+ * Writes that span EOF might trigger an IO size update on completion,
+ * so consider them to be dirty for the purposes of O_DSYNC even if
+ * no other metadata changes are pending or have been made here.
+ */
+ if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode))
+ iomap_flags |= IOMAP_F_DIRTY;
+ if (shared)
+ iomap_flags |= IOMAP_F_SHARED;
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
out_found:
ASSERT(nimaps);
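
The IOMAP_F_DIRTY hunk above encodes a subtle O_DSYNC rule: an EOF-extending write updates the on-disk inode size at I/O completion, so the mapping must look dirty even when nothing else changed. The consumer side, sketched under the assumption that the iomap direct I/O code keeps its FUA optimization:

	/*
	 * Sketch (assumed consumer logic): a pure O_DSYNC overwrite of
	 * stable, unshared blocks can be issued as REQ_FUA and skip the
	 * separate cache flush; IOMAP_F_DIRTY vetoes that fast path so
	 * completion goes through the real sync path that also commits
	 * the size update.
	 */
	bool use_fua =
		(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC &&
		!(iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY));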
@@ -1145,7 +1162,8 @@ xfs_seek_iomap_begin(
loff_t offset,
loff_t length,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1193,7 +1211,7 @@ xfs_seek_iomap_begin(
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
@@ -1215,7 +1233,7 @@ xfs_seek_iomap_begin(
imap.br_state = XFS_EXT_NORM;
done:
xfs_trim_extent(&imap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
@@ -1231,7 +1249,8 @@ xfs_xattr_iomap_begin(
loff_t offset,
loff_t length,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1261,7 +1280,7 @@ out_unlock:
if (error)
return error;
ASSERT(nimaps);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
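
Each ->iomap_begin prototype in this file grows a struct iomap *srcmap argument; the xfs hunks merely thread it through, but the widened method lets a COW filesystem report the extent to read from separately from the extent being written. For reference, the shape of the new iomap_ops method (myfs_* names are placeholders):

static int
myfs_iomap_begin(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	unsigned		flags,
	struct iomap		*iomap,
	struct iomap		*srcmap)
{
	/*
	 * Describe the mapping for [offset, offset + length) in *iomap;
	 * fill *srcmap only if reads must come from a different extent,
	 * which none of the xfs callers above need yet.
	 */
	return 0;
}

static const struct iomap_ops myfs_iomap_ops = {
	.iomap_begin	= myfs_iomap_begin,
};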
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 5c2f6aa6d78f..71d0ae460c44 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -16,7 +16,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *, bool shared);
+ struct xfs_bmbt_irec *, u16);
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
static inline xfs_filblks_t
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index a339bd5fa260..9c96493be9e0 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -178,7 +178,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 0f08153b4994..a9634110c783 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1442,7 +1442,7 @@ xfs_reflink_dirty_extents(
flen = XFS_FSB_TO_B(mp, rlen);
if (fpos + flen > isize)
flen = isize - fpos;
- error = iomap_file_dirty(VFS_I(ip), fpos, flen,
+ error = iomap_file_unshare(VFS_I(ip), fpos, flen,
&xfs_iomap_ops);
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8d1df9f8be07..0a8cf6b87a21 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -40,7 +40,6 @@
#include <linux/parser.h>
static const struct super_operations xfs_super_operations;
-struct bio_set xfs_ioend_bioset;
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
@@ -1853,15 +1852,10 @@ MODULE_ALIAS_FS("xfs");
STATIC int __init
xfs_init_zones(void)
{
- if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct xfs_ioend, io_inline_bio),
- BIOSET_NEED_BVECS))
- goto out;
-
xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
"xfs_log_ticket");
if (!xfs_log_ticket_zone)
- goto out_free_ioend_bioset;
+ goto out;
xfs_bmap_free_item_zone = kmem_zone_init(
sizeof(struct xfs_extent_free_item),
@@ -1996,8 +1990,6 @@ xfs_init_zones(void)
kmem_zone_destroy(xfs_bmap_free_item_zone);
out_destroy_log_ticket_zone:
kmem_zone_destroy(xfs_log_ticket_zone);
- out_free_ioend_bioset:
- bioset_exit(&xfs_ioend_bioset);
out:
return -ENOMEM;
}
@@ -2028,7 +2020,6 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_btree_cur_zone);
kmem_zone_destroy(xfs_bmap_free_item_zone);
kmem_zone_destroy(xfs_log_ticket_zone);
- bioset_exit(&xfs_ioend_bioset);
}
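
With struct xfs_ioend gone, the bioset backing its inline bio moves with it, so xfs_init_zones()/xfs_destroy_zones() no longer manage it. Presumably the generic iomap code now performs the equivalent setup; a sketch matching the removed parameters (the iomap_ioend_bioset name is assumed from the generic code this series adds):

/* Sketch of the initialization assumed to live in the iomap core now. */
static int __init iomap_init(void)
{
	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
			   offsetof(struct iomap_ioend, io_inline_bio),
			   BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);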
STATIC int __init
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index eaae275ed430..cbb23d7a3554 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1158,71 +1158,6 @@ DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
-DECLARE_EVENT_CLASS(xfs_page_class,
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
- unsigned int len),
- TP_ARGS(inode, page, off, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(pgoff_t, pgoff)
- __field(loff_t, size)
- __field(unsigned long, offset)
- __field(unsigned int, length)
- ),
- TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = XFS_I(inode)->i_ino;
- __entry->pgoff = page_offset(page);
- __entry->size = i_size_read(inode);
- __entry->offset = off;
- __entry->length = len;
- ),
- TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
- "length %x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->pgoff,
- __entry->size,
- __entry->offset,
- __entry->length)
-)
-
-#define DEFINE_PAGE_EVENT(name) \
-DEFINE_EVENT(xfs_page_class, name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
- unsigned int len), \
- TP_ARGS(inode, page, off, len))
-DEFINE_PAGE_EVENT(xfs_writepage);
-DEFINE_PAGE_EVENT(xfs_releasepage);
-DEFINE_PAGE_EVENT(xfs_invalidatepage);
-
-DECLARE_EVENT_CLASS(xfs_readpage_class,
- TP_PROTO(struct inode *inode, int nr_pages),
- TP_ARGS(inode, nr_pages),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(int, nr_pages)
- ),
- TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = inode->i_ino;
- __entry->nr_pages = nr_pages;
- ),
- TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->nr_pages)
-)
-
-#define DEFINE_READPAGE_EVENT(name) \
-DEFINE_EVENT(xfs_readpage_class, name, \
- TP_PROTO(struct inode *inode, int nr_pages), \
- TP_ARGS(inode, nr_pages))
-DEFINE_READPAGE_EVENT(xfs_vm_readpage);
-DEFINE_READPAGE_EVENT(xfs_vm_readpages);
-
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int whichfork, struct xfs_bmbt_irec *irec),
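
The deleted page and readpage trace classes are not lost coverage: filesystem-neutral equivalents belong in the new iomap trace header, where every iomap user gets them. A sketch of the replacement readpage class, mirroring the removed one with iomap_* names (names and field types assumed):

DECLARE_EVENT_CLASS(iomap_readpage_class,
	TP_PROTO(struct inode *inode, int nr_pages),
	TP_ARGS(inode, nr_pages),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(u64, ino)
		__field(int, nr_pages)
	),
	TP_fast_assign(
		__entry->dev = inode->i_sb->s_dev;
		__entry->ino = inode->i_ino;
		__entry->nr_pages = nr_pages;
	),
	TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino, __entry->nr_pages)
)

#define DEFINE_READPAGE_EVENT(name)		\
DEFINE_EVENT(iomap_readpage_class, name,	\
	TP_PROTO(struct inode *inode, int nr_pages), \
	TP_ARGS(inode, nr_pages))
DEFINE_READPAGE_EVENT(iomap_readpage);
DEFINE_READPAGE_EVENT(iomap_readpages);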